Learning by doing: reading books and working through their code examples

40 lines
1.3 KiB

  1. from urllib.request import urlopen
  2. from urllib.parse import urlparse
  3. import re
  4. import sys
  5. LINK_REGEX = re.compile("<a [^>]*href=['\"]([^\"]+)['\"][^>]*>")
  6. class LinkCollector:
  7. def __init__(self, url):
  8. self.url = "http://" + urlparse(url).netloc
  9. self.collected_links = set()
  10. self.visited_links = set()
  11. def collect_links(self, path="/"):
  12. full_url = self.url + path
  13. self.visited_links.add(full_url)
  14. page = str(urlopen(full_url).read())
  15. links = LINK_REGEX.findall(page)
  16. links = {self.normalize_url(path, link) for link in links}
  17. self.collected_links = links.union(self.collected_links)
  18. self.unvisited_links = links.difference(self.visited_links)
  19. for link in self.unvisited_links:
  20. if link.startswith(self.url):
  21. self.collect_links(urlparse(link).path)
  22. def normalize_url(self, path, link):
  23. if link.startswith("http://") or link.startswith("https://"):
  24. return link
  25. elif link.startswith("/"):
  26. return self.url + link
  27. else:
  28. return self.url + path.rpartition('/')[0] + '/' + link
  29. if __name__ == "__main__":
  30. collector = LinkCollector(sys.argv[1])
  31. collector.collect_links()
  32. for link in collector.collected_links:
  33. print(link)