Learning by doing: Reading books and trying to understand the (code) examples

from urllib.request import urlopen
from urllib.parse import urlparse
import re
import sys

# Matches the href value of an anchor tag; accepts single or double quotes.
LINK_REGEX = re.compile("<a [^>]*href=['\"]([^'\"]+)['\"][^>]*>")


class LinkCollector:
    def __init__(self, url):
        # Keep only the host so every crawl stays on the starting site.
        self.url = "http://" + urlparse(url).netloc
        self.collected_links = set()
        self.visited_links = set()

    def collect_links(self, path="/"):
        full_url = self.url + path
        self.visited_links.add(full_url)
        page = urlopen(full_url).read().decode("utf-8", errors="replace")
        links = LINK_REGEX.findall(page)
        links = {self.normalize_url(path, link) for link in links}
        self.collected_links = links.union(self.collected_links)
        # Recurse only into same-site pages that have not been visited yet.
        unvisited_links = links.difference(self.visited_links)
        for link in unvisited_links:
            if link.startswith(self.url):
                self.collect_links(urlparse(link).path)

    def normalize_url(self, path, link):
        # Turn absolute, site-rooted, and relative links into full URLs.
        if link.startswith("http://") or link.startswith("https://"):
            return link
        elif link.startswith("/"):
            return self.url + link
        else:
            return self.url + path.rpartition('/')[0] + '/' + link


if __name__ == "__main__":
    collector = LinkCollector(sys.argv[1])
    collector.collect_links()
    for link in collector.collected_links:
        print(link)
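
To poke at the building blocks without touching the network, a quick check of LINK_REGEX and normalize_url is enough. The HTML snippet and the example.com/example.org URLs below are made up purely for illustration:

# Sanity-check the regex and the URL normalization in isolation (no network calls).
# The sample HTML and the example.com/example.org URLs are invented for illustration only.
sample = '<a href="/about">About</a> <a href="https://example.org/">Ext</a> <a href="faq.html">FAQ</a>'
print(LINK_REGEX.findall(sample))
# ['/about', 'https://example.org/', 'faq.html']

collector = LinkCollector("http://example.com")
print(collector.normalize_url("/docs/index.html", "faq.html"))
# http://example.com/docs/faq.html (relative links resolve against the current page's directory)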