BDA_scraper_utils.py
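"""Utilities for scraping Boeing Defence Australia (BDA) job listings from jobs.boeing.com into a pandas DataFrame and saving them to CSV."""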
import os
import pandas as pd
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
from selenium_stealth import stealth
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager


def configure_webdriver():
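    """Build and return a headless Chrome WebDriver with selenium-stealth settings applied."""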
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument('--log-level=1')
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
# Initialize the Chrome WebDriver
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
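    # Apply selenium-stealth tweaks so the headless browser is less likely to be flagged as automated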
stealth(driver,
languages=["en-US", "en"],
vendor="Google Inc.",
platform="Win32",
webgl_vendor="Intel Inc.",
renderer="Intel Iris OpenGL Engine",
fix_hairline=True,
)
return driver


def scrape_job_data(driver, Job_Classification, location):
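    """Scrape Boeing Defence Australia job listings into a DataFrame.

    Note: the `location` argument is currently unused; each listing's location
    is read from the page itself.
    """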
df = pd.DataFrame(columns=['Link', 'Job Title', 'Job Classification', 'Location', 'Company'])
url = 'https://jobs.boeing.com/category/boeing-defence-australia-jobs/185-18469/2681/1'
driver.get(url)
print(f"Scraping {url}")
try:
# Locate and click on the "Show All" link
show_all_link = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//a[@class="pagination-show-all"]')))
driver.execute_script("arguments[0].click();", show_all_link)
# Wait for a fixed time (e.g., 5 seconds) to allow the page to load
print("Clicked 'Show All' and waiting for page to load.")
time.sleep(5) # Adjust the sleep time as necessary
except TimeoutException:
print("Timeout waiting for 'Show All' link to be clickable")
return df
except NoSuchElementException:
print("'Show All' link not found")
return df
except ElementClickInterceptedException:
print("Unable to click 'Show All' link, it may be obscured")
return df
except Exception as e:
print(f"Unexpected error when clicking 'Show All': {e}")
return df
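    # Parse the fully expanded results page and collect all job listing cards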
soup = BeautifulSoup(driver.page_source, 'lxml')
job_listings = soup.find_all('li', class_='no-security-clearance')
if not job_listings:
print("No job listings found.")
return df
for job in job_listings:
try:
link_element = job.find('a', class_='search-results__job-link')
if not link_element:
continue
link = link_element.get('href')
if not link:
continue
link_full = 'https://jobs.boeing.com' + link
job_title = job.find('span', class_='search-results__job-title').text.strip()
company = 'BDA'
location_element = job.find('span', class_='search-results__job-info location')
location = location_element.text.strip() if location_element else ''
print(f"Scraped job: {job_title} - {location}")
new_data = pd.DataFrame({
'Link': [link_full],
'Job Title': [job_title],
'Job Classification': [Job_Classification],
'Location': [location],
'Company': [company]
})
df = pd.concat([df, new_data], ignore_index=True)
except Exception as e:
print(f"Error scraping job: {e}")
return df


# Create the csv_files output directory at import time if it doesn't already exist
output_dir = './csv_files'
if not os.path.exists(output_dir):
os.makedirs(output_dir)


def save_df_to_csv(df, output_dir):
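    """Save the scraped job DataFrame to BDA_job_data.csv inside output_dir."""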
# Ensure the directory exists
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# Define the file path for the CSV
file_path = os.path.join(output_dir, 'BDA_job_data.csv')
# Save the DataFrame to a CSV file
df.to_csv(file_path, index=False)
print(f"Data saved to {file_path}")


# Main execution
if __name__ == "__main__":
driver = configure_webdriver()
try:
df = scrape_job_data(driver, 'Engineering', 'Australia')
save_df_to_csv(df, output_dir)
finally:
driver.quit()