-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathatomparse.py
30 lines (23 loc) · 852 Bytes
/
atomparse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import requests
import sys
import xml.etree.ElementTree as ET
from urllib.parse import urljoin
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Dump just the HTML links found in an Atom XML feed, one absolute URL per line.

# Namespace map so XPath queries can use the 'atom:' prefix.
ATOM_NAMESPACES = {'atom': 'http://www.w3.org/2005/Atom'}

# Upper bound (seconds) on the HTTP request so a dead server cannot hang us.
REQUEST_TIMEOUT_SECONDS = 30


def extract_html_links(xml_content: bytes, base_url: str) -> list:
    """Return absolute URLs of every Atom ``link`` of type ``text/html``.

    Args:
        xml_content: Raw Atom feed document (bytes or str) to parse.
        base_url: URL the feed was fetched from; relative ``href`` values
            are resolved against it with ``urljoin``.

    Returns:
        A list of absolute URL strings in document order. ``link`` elements
        that carry no ``href`` attribute are skipped.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_content`` is not
            well-formed XML.
    """
    root = ET.fromstring(xml_content)
    # Find all 'atom:link' elements with a 'type' attribute of 'text/html'.
    links = root.findall(
        ".//atom:link[@type='text/html']", namespaces=ATOM_NAMESPACES
    )
    # urljoin(base, None) would silently return the base URL itself, so a
    # link without an href must be filtered out rather than resolved.
    return [
        urljoin(base_url, href)
        for href in (link.get('href') for link in links)
        if href
    ]


def main() -> None:
    """Fetch the feed named on the command line (or prompted for) and print
    each HTML link it contains on its own line."""
    # Take the URL from the first CLI argument, otherwise ask the user.
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        url = input("Please enter the URL: ")

    # NOTE: TLS certificate verification is deliberately disabled here (the
    # file also silences urllib3's InsecureRequestWarning); only point this
    # tool at hosts you trust.
    response = requests.get(url, verify=False, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()

    for href in extract_html_links(response.content, url):
        print(href)


if __name__ == "__main__":
    main()