Learning by doing: Reading books and trying to understand the (code) examples
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

40 lines
1.3 KiB

from urllib.request import urlopen
from urllib.parse import urlparse
import re
import sys
# Matches an opening <a ...> tag and captures the value of its href attribute.
# The captured value stops at the first quote of EITHER kind, so a
# single-quoted attribute (href='x') is not over-captured into the rest of
# the tag when a later attribute uses double quotes.
LINK_REGEX = re.compile(r"""<a [^>]*href=['"]([^'"]+)['"][^>]*>""")
class LinkCollector:
    """Crawl a single site over HTTP and collect every link found on it.

    Attributes:
        url: Site root in the form ``"http://<netloc>"`` (no trailing slash).
        collected_links: Every normalized link seen so far, internal or not.
        visited_links: Full URLs of the pages that have been fetched.
        unvisited_links: Links found on the most recently fetched page that
            had not been visited at that moment.
    """

    def __init__(self, url):
        """Derive the site root from *url*.

        Args:
            url: Any URL on the site to crawl. Only its network location is
                kept; a scheme-less value such as ``"example.com/docs"`` is
                accepted as well.
        """
        netloc = urlparse(url).netloc
        if not netloc:
            # Without a scheme urlparse() puts the host into ``path``;
            # prefixing "//" makes it parse as a network location.
            netloc = urlparse("//" + url).netloc
        self.url = "http://" + netloc
        self.collected_links = set()
        self.visited_links = set()
        self.unvisited_links = set()

    def collect_links(self, path="/"):
        """Fetch *path* and every internal page reachable from it.

        All links found are added to ``collected_links``; only links that
        point at this site are followed. Pages already recorded in
        ``visited_links`` are not fetched again, except for *path* itself,
        which is always fetched.

        Args:
            path: Absolute path on the site at which to start crawling.

        Raises:
            urllib.error.URLError: Propagated from ``urlopen`` when a page
                cannot be retrieved.
        """
        # An explicit stack instead of recursion keeps large sites from
        # running into the interpreter's recursion limit.
        queued = {path}
        pending = [path]
        while pending:
            current_path = pending.pop()
            full_url = self.url + current_path
            self.visited_links.add(full_url)

            page = self._fetch_page(full_url)
            links = {
                self.normalize_url(current_path, link)
                for link in LINK_REGEX.findall(page)
            }
            self.collected_links = links.union(self.collected_links)
            self.unvisited_links = links.difference(self.visited_links)

            for link in self.unvisited_links:
                next_path = self._internal_path(link)
                if next_path is None or next_path in queued:
                    continue
                # Compare the URL that would actually be fetched: the link
                # itself may carry a query or fragment that never appears in
                # visited_links, which would otherwise cause endless refetching.
                if self.url + next_path in self.visited_links:
                    continue
                queued.add(next_path)
                pending.append(next_path)

    def normalize_url(self, path, link):
        """Return *link* as an absolute URL.

        Args:
            path: Path of the page on which *link* was found.
            link: Raw ``href`` value.

        Returns:
            *link* unchanged when it already starts with ``http://`` or
            ``https://``; the site root plus *link* when it starts with
            ``/``; otherwise *link* resolved against the directory part
            of *path*.
        """
        if link.startswith(("http://", "https://")):
            return link
        if link.startswith("/"):
            return self.url + link
        return self.url + path.rpartition("/")[0] + "/" + link

    def _fetch_page(self, full_url):
        """Download *full_url* and return its body decoded to text."""
        # The context manager closes the connection even if read() fails.
        with urlopen(full_url) as response:
            charset = response.headers.get_content_charset() or "utf-8"
            body = response.read()
        try:
            return body.decode(charset, errors="replace")
        except LookupError:
            # The server announced a charset Python does not know.
            return body.decode("utf-8", errors="replace")

    def _internal_path(self, link):
        """Return the path of *link* if it points at this site, else None."""
        parsed = urlparse(link)
        # Compare scheme and host exactly; a plain prefix test would also
        # accept hosts such as "example.com.evil.org" or "example.com:8080".
        if parsed.scheme + "://" + parsed.netloc != self.url:
            return None
        # A bare "http://host" link refers to the same page as "/".
        return parsed.path or "/"
if __name__ == "__main__":
    # Command-line entry point: crawl the site given as the first argument
    # and print every link that was collected, one per line.
    if len(sys.argv) < 2:
        # Exit with a usage message rather than an IndexError traceback.
        sys.exit("usage: {} URL".format(sys.argv[0]))
    collector = LinkCollector(sys.argv[1])
    collector.collect_links()
    # Sets have no defined order; sorting makes the output reproducible.
    for link in sorted(collector.collected_links):
        print(link)