Pull main branch changes: Merge branch 'main' of https://github.com/codeforsanjose/city-agenda-scraper into gh-41-spacy
codeforsanjose · Sep 24, 2021 · 78fe3f8 · 78fe3f8
2 parents 519152e + 7956b63
commit 78fe3f8
Show file tree

Hide file tree

Showing 3 changed files with 102 additions and 14 deletions.
diff --git a/Legistar_scraper/Gdrive_upload_methods.py b/Legistar_scraper/Gdrive_upload_methods.py
@@ -13,11 +13,77 @@ def drive_launch():
 
 
 
-def drive_upload(path, drive): 
-    folder = drive.ListFile({'q': "title = 'San Jose' and trashed=false"}).GetList()[0]
-    # this is set only for San Jose for now, but we will need to make it dynamic for other cities in the future
-    for x in os.listdir(path):
+def get_city_folder(drive, current_city='sanjose'):
+    """Creates a city folder if one does not exist and returns a city folder
+    under the shared drive 'Cities'
+    returns: GoogleDriveFile object """
+
+    SHARED_FOLDER_NAME = 'Cities'
+    shared_query = "title = '{}' and sharedWithMe and trashed=false".format(SHARED_FOLDER_NAME)
+
+    # List the (shared) Cities folder
+    shared_folder_list = drive.ListFile({'q': shared_query}).GetList()
+
+    # Next - list all files under the shared Cities folder
+    # Build a new query for the same
+    folder_query = "'{}' in parents and trashed=false".format(shared_folder_list[0]['id'])
+    city_list = drive.ListFile({'q': folder_query}).GetList()
+
+    # city_list above now contains a list of GoogleDriveFile objects
+    # Iterate through the city list to see if the folder for current_city exists.
+    # If not - first create the folder
+    # Subsequently - the folder now exists - either it already existed - or a folder by that name was just created
+    # either way - the folder now exists
+    # Upload to the folder
+
+    folder = None
+    for index, a_city in enumerate(city_list):
+        if a_city['title'] == current_city:
+            folder = city_list[index]
+            break
+
+    # Create the city if it does not exist
+    if folder is None:
+        # No city by the name exists
+        folder_query = "title = '{}' and trashed=false".format(SHARED_FOLDER_NAME)
+        parent_folder = drive.ListFile({'q': folder_query}).GetList()
+
+        folder_name = current_city
+        folder = drive.CreateFile({'title' : folder_name, 
+                               'mimeType' : 'application/vnd.google-apps.folder',
+                               'parents': [{'id': parent_folder[0]['id']}]})
+        folder.Upload()
+
+    # A folder now exists - return it so the csv can be uploaded into it
+    return folder
+
+
+def upload_files(folder, full_path, drive):
+    """Upload all files in the folder indicated by full_path (on the workstation)
+    to the Google Drive folder identified by folder. Note that the files in full_path
+    likely should be deleted - or moved out - so that the same files are not uploaded
+    subsequently"""
+    for x in os.listdir(full_path):
         f = drive.CreateFile({'title': x, 'parents': [{'id': folder['id']}]})
-        f.SetContentFile(os.path.join(path, x))
+        f.SetContentFile(os.path.join(full_path, x))
         f.Upload()
-    f = None
+
+
+
+
+
+def drive_upload(full_path, drive, current_city = 'sanjose'):
+
+    # The Cities folder is shared with the account that was chosen when the script is run
+    # NOTE: This login requires that you choose the Google account that has the Cities folder shared
+    # In other words, the code below as is will likely not work if you are logged in to the
+    # agendascraper@gmail.com directly. If we do need to support that scenario - that of having
+    # logged directly into agendascraper@gmail.com account, then the logic below will need to be
+    # enhanced. 
+    # 
+    folder = get_city_folder(drive, current_city)
+
+    # Upload the csv. The below may need to be revised/refactored as once the files are uploaded, maybe the
+    # files in the folder should be deleted? That way - at the next run - the same files are not uploaded
+    # again
+    upload_files(folder, full_path, drive)
diff --git a/Legistar_scraper/Legistar_Selenium.py b/Legistar_scraper/Legistar_Selenium.py
@@ -17,11 +17,14 @@
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.common.by import By
+from selenium.webdriver.firefox.options import Options
 import pandas as pd
 from dotenv import load_dotenv
 from Gdrive_upload_methods import drive_upload, drive_launch
 
+load_dotenv()
 full_path = os.environ.get('full_path')
+
 drive = drive_launch()
 current_city = sys.argv[1]
 time_period = sys.argv[2]
@@ -30,10 +33,15 @@
 session = HTMLSession()
 
 
-def scrape_meetings(url):
 
-    driver = webdriver.Firefox()
-    driver.get("https://%s.legistar.com/Calendar.aspx" %(url))
+
+def get_link():
+
+    # Set up options to launch Firefox
+    options = Options()
+    options.headless = True
+    driver = webdriver.Firefox(options=options)
+    driver.get("https://%s.legistar.com/Calendar.aspx" %(current_city))
     WebDriverWait(driver, 10000).until(EC.presence_of_element_located(
         (By.ID, 'ctl00_ContentPlaceHolder1_tdYears')))
     driver.find_element_by_id('ctl00_ContentPlaceHolder1_tdYears').click()
@@ -81,10 +89,12 @@ def scrape_meetings(url):
         link = cells[Meeting_Details_index].find_element_by_link_text('Meeting details').get_attribute('href')
 
         if link is not None:
+            break
 
-
-            get_agenda(link, Meeting_Date, Name)
     driver.close()
+    return(link, Meeting_Date, Name)
+
+
 
 def get_agenda(link, meeting_date, meeting_name):
     print(link)
@@ -122,8 +132,20 @@ def get_agenda(link, meeting_date, meeting_name):
                 continue
 
     agenda_table['Staff Report link'] = data
-    agenda_table.to_csv((full_path + meeting_name + '_' + meeting_date + '.csv'), index=False, errors='replace')
+    return agenda_table
+
+
+
+
+# Get the URL
+link, meeting_date, meeting_name = get_link()
+
+agenda_table = get_agenda(link, meeting_date, meeting_name)
+
+# Generate the appropriate csv
+agenda_table.to_csv((full_path + meeting_name + '_' + meeting_date + '.csv'), index=False, errors='replace')
 # Creates filename with 'Meeting Name' and 'Date'
-    drive_upload(full_path, drive)
 
-scrape_meetings(current_city)
+# Now that we have the csv - upload to Google drive
+drive_upload(full_path, drive, current_city)
+
diff --git a/Legistar_scraper/__pycache__/Gdrive_upload_methods.cpython-39.pyc b/Legistar_scraper/__pycache__/Gdrive_upload_methods.cpython-39.pyc