Skip to content

Commit

Permalink
Pull main branch changes Merge branch 'main' of https://github.com/co…
Browse files Browse the repository at this point in the history
  • Loading branch information
s-saloni committed Sep 24, 2021
2 parents 519152e + 7956b63 commit 78fe3f8
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 14 deletions.
78 changes: 72 additions & 6 deletions Legistar_scraper/Gdrive_upload_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,77 @@ def drive_launch():



def drive_upload(path, drive):
folder = drive.ListFile({'q': "title = 'San Jose' and trashed=false"}).GetList()[0]
# this is set only for San Jose for now, but we will need to make it dynamic for other cities in the future
for x in os.listdir(path):
def get_city_folder(drive, current_city='sanjose'):
    """Return the Drive folder for ``current_city``, creating it if needed.

    The city folder is looked up directly under the shared 'Cities' folder.
    When no child with that title exists, a new folder is created under the
    same 'Cities' folder and uploaded before being returned.

    Args:
        drive: Drive client exposing ``ListFile`` and ``CreateFile``
            (presumably a PyDrive ``GoogleDrive`` instance -- confirm
            against the caller).
        current_city: Title of the city folder to find or create.

    Returns:
        The GoogleDriveFile object for the city folder.

    Raises:
        IndexError: If no folder titled 'Cities' is shared with the
            logged-in account.
    """
    SHARED_FOLDER_NAME = 'Cities'

    # Locate the (shared) Cities folder.
    shared_query = "title = '{}' and sharedWithMe and trashed=false".format(
        SHARED_FOLDER_NAME)
    shared_folder_list = drive.ListFile({'q': shared_query}).GetList()
    if not shared_folder_list:
        # Same exception type as the bare [0] lookup would raise, but with
        # a message that explains what is actually missing.
        raise IndexError(
            "No folder titled '{}' is shared with this account".format(
                SHARED_FOLDER_NAME))
    shared_folder_id = shared_folder_list[0]['id']

    # List everything directly under the shared Cities folder.
    folder_query = "'{}' in parents and trashed=false".format(shared_folder_id)
    city_list = drive.ListFile({'q': folder_query}).GetList()

    # Pick the first child whose title matches the requested city, if any.
    folder = next(
        (a_city for a_city in city_list if a_city['title'] == current_city),
        None)

    if folder is None:
        # No folder for this city yet: create it under the very same parent
        # that was just listed, so lookup and creation can never disagree
        # about which 'Cities' folder is meant.
        folder = drive.CreateFile({
            'title': current_city,
            'mimeType': 'application/vnd.google-apps.folder',
            'parents': [{'id': shared_folder_id}],
        })
        folder.Upload()

    return folder


def upload_files(folder, full_path, drive):
    """Upload every file in a local directory to a Google Drive folder.

    Each regular file found directly inside ``full_path`` is uploaded under
    its own file name. Sub-directories are skipped because they cannot be
    used as file content. Nothing is deleted locally, so files left in
    ``full_path`` will be uploaded again on the next call.

    Args:
        folder: Drive folder object (or mapping) whose ``'id'`` identifies
            the destination folder.
        full_path: Path of the local directory to upload from.
        drive: Drive client exposing ``CreateFile``.
    """
    for name in os.listdir(full_path):
        local_file = os.path.join(full_path, name)
        if not os.path.isfile(local_file):
            # Only regular files can be set as upload content.
            continue
        drive_file = drive.CreateFile(
            {'title': name, 'parents': [{'id': folder['id']}]})
        drive_file.SetContentFile(local_file)
        drive_file.Upload()





def drive_upload(full_path, drive, current_city='sanjose'):
    """Upload the files in ``full_path`` to the Drive folder of a city.

    The destination is the folder for ``current_city`` as resolved by
    ``get_city_folder``; the files are then sent with ``upload_files``.

    NOTE: The 'Cities' folder is expected to be shared with the Google
    account chosen at login when the script runs. The logic will likely not
    work when logged in directly to the agendascraper@gmail.com account; it
    would need to be extended to support that scenario.

    NOTE: Files are not removed from ``full_path`` after uploading, so the
    same files are uploaded again on the next run. This may need to be
    revised so that uploaded files are deleted or moved away.
    """
    upload_files(get_city_folder(drive, current_city), full_path, drive)
38 changes: 30 additions & 8 deletions Legistar_scraper/Legistar_Selenium.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,14 @@
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
import pandas as pd
from dotenv import load_dotenv
from Gdrive_upload_methods import drive_upload, drive_launch

load_dotenv()
full_path = os.environ.get('full_path')

drive = drive_launch()
current_city = sys.argv[1]
time_period = sys.argv[2]
Expand All @@ -30,10 +33,15 @@
session = HTMLSession()


def scrape_meetings(url):

driver = webdriver.Firefox()
driver.get("https://%s.legistar.com/Calendar.aspx" %(url))

def get_link():

# Set up options to launch Firefox
options = Options()
options.headless = True
driver = webdriver.Firefox(options=options)
driver.get("https://%s.legistar.com/Calendar.aspx" %(current_city))
WebDriverWait(driver, 10000).until(EC.presence_of_element_located(
(By.ID, 'ctl00_ContentPlaceHolder1_tdYears')))
driver.find_element_by_id('ctl00_ContentPlaceHolder1_tdYears').click()
Expand Down Expand Up @@ -81,10 +89,12 @@ def scrape_meetings(url):
link = cells[Meeting_Details_index].find_element_by_link_text('Meeting details').get_attribute('href')

if link is not None:
break


get_agenda(link, Meeting_Date, Name)
driver.close()
return(link, Meeting_Date, Name)



def get_agenda(link, meeting_date, meeting_name):
print(link)
Expand Down Expand Up @@ -122,8 +132,20 @@ def get_agenda(link, meeting_date, meeting_name):
continue

agenda_table['Staff Report link'] = data
agenda_table.to_csv((full_path + meeting_name + '_' + meeting_date + '.csv'), index=False, errors='replace')
return agenda_table




# Get the URL
link, meeting_date, meeting_name = get_link()

agenda_table = get_agenda(link, meeting_date, meeting_name)

# Generate the appropriate csv
agenda_table.to_csv((full_path + meeting_name + '_' + meeting_date + '.csv'), index=False, errors='replace')
# Creates filename with 'Meeting Name' and 'Date'
drive_upload(full_path, drive)

scrape_meetings(current_city)
# Now that we have the csv - upload to Google drive
drive_upload(full_path, drive, current_city)

Binary file not shown.

0 comments on commit 78fe3f8

Please sign in to comment.