-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_text.py
52 lines (41 loc) · 2.15 KB
/
get_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
def extract_content(driver):
    """Append the chapter text of the page loaded in *driver* to a file.

    The chapter is everything from the first ``<h2>`` up to the hidden
    ``favoriteshow_3`` div that follows it. Line-break tags become newlines,
    the heading/div tags are stripped, and the result is appended (followed
    by a blank line) to ``extracted_content.txt`` in the current working
    directory.

    Args:
        driver: Object exposing the current page HTML as ``page_source``
            (for example a Selenium WebDriver).

    Returns:
        True if both markers were found and the chapter was written, False
        if the page was skipped because a marker was missing.
    """
    content = _slice_chapter(driver.page_source)
    if content is None:
        # Nothing recognisable on this page; report it instead of failing
        # or writing an empty entry.
        return False

    # Append mode: successive calls accumulate chapters in one file.
    with open('extracted_content.txt', 'a', encoding='utf-8') as file:
        file.write(content + "\n\n")
    return True


def _slice_chapter(html_source):
    """Return the cleaned chapter text in *html_source*, or None if a marker is missing."""
    start_marker = '<h2>'
    end_marker = '<div id="favoriteshow_3" style="display:none" align="center"></div>'

    start_pos = html_source.find(start_marker)
    if start_pos == -1:
        return None
    # Only look for the end marker after the heading so the slice is never
    # empty or reversed.
    end_pos = html_source.find(end_marker, start_pos)
    if end_pos == -1:
        return None

    content = html_source[start_pos:end_pos]
    # Line breaks carry the paragraph structure; accept every common spelling.
    for line_break in ('<br>', '<br/>', '<br />'):
        content = content.replace(line_break, '\n')
    # The remaining wrapper tags carry no text and are simply dropped.
    for tag in ('</div>', '<h2>', '</h2>', '<div style="clear:both;">'):
        content = content.replace(tag, '')
    return content
def main(total_chapters, url="the_website's_url", page_delay=10):
    """Scrape *total_chapters* consecutive chapters into the output file.

    Opens *url* in Chrome, extracts the chapter on the current page, then
    presses the right-arrow key on the page body to move to the next
    chapter, repeating until *total_chapters* pages have been processed.

    Args:
        total_chapters: Number of pages to extract, starting at *url*.
        url: Address of the first chapter. The default is a placeholder;
            replace it with the page you want to scrape.
        page_delay: Seconds to wait after each navigation so the next page
            can load and the requests are not flagged as automated.
    """
    # Requires the WebDriver for your browser to be installed; swap in
    # `webdriver.Firefox()` etc. if you do not use Chrome.
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        for chapter_index in range(total_chapters):
            extract_content(driver)
            if chapter_index == total_chapters - 1:
                # Last chapter done: no need to navigate or wait any further.
                break
            # Simulate pressing the right arrow key to go to the next chapter.
            driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.RIGHT)
            time.sleep(page_delay)
    finally:
        # Always shut the browser down, even if a page fails mid-run.
        driver.quit()
if __name__ == "__main__":
    # Number of chapters to scrape; change this to the real chapter count.
    total_chapters = 51
    main(total_chapters=total_chapters)