from urllib.request import urlopen
from urllib.parse import urlparse
import re
import sys

# Matches an anchor tag and captures the href value. The character class
# excludes BOTH quote kinds so a single quote cannot leak into a
# double-quoted href (and vice versa).
LINK_REGEX = re.compile(r"<a [^>]*href=['\"]([^'\"]+)['\"][^>]*>")


class LinkCollector:
    """Recursively crawl a website and collect every link reachable from it.

    Only pages on the starting host are fetched; off-site links are
    recorded in ``collected_links`` but never followed.
    """

    def __init__(self, url):
        # Keep the caller's scheme (so https stays https); fall back to
        # http for scheme-less input such as "example.com/path".
        parts = urlparse(url)
        self.url = (parts.scheme or "http") + "://" + parts.netloc
        self.collected_links = set()   # every link seen, absolute form
        self.visited_links = set()     # pages already fetched

    def collect_links(self, path="/"):
        """Fetch *path*, record its links, and recurse into unvisited
        same-site pages.

        :param path: site-relative path of the page to crawl.
        """
        full_url = self.url + path
        if full_url in self.visited_links:
            return  # already crawled; guards against revisits and cycles
        self.visited_links.add(full_url)
        # Decode the response bytes explicitly: str(bytes) would yield the
        # "b'...'" repr and corrupt every URL extracted from the page.
        # NOTE(review): assumes UTF-8 pages; "replace" keeps malformed
        # bytes from raising mid-crawl.
        page = urlopen(full_url).read().decode("utf-8", "replace")
        links = {self.normalize_url(path, link)
                 for link in LINK_REGEX.findall(page)}
        self.collected_links = links.union(self.collected_links)
        # Local variable, not an instance attribute: the recursive calls
        # below would otherwise clobber the set the caller is iterating.
        unvisited = links.difference(self.visited_links)
        for link in unvisited:
            # Only follow links that stay on the starting host.
            if link.startswith(self.url):
                self.collect_links(urlparse(link).path)

    def normalize_url(self, path, link):
        """Return an absolute URL for *link* as found on page *path*.

        Absolute links pass through unchanged; host-relative links
        ("/x") resolve against the site root; page-relative links ("x")
        resolve against the current page's directory.
        """
        if link.startswith("http://") or link.startswith("https://"):
            return link
        elif link.startswith("/"):
            return self.url + link
        else:
            # rpartition keeps everything up to the page's directory.
            return self.url + path.rpartition('/')[0] + '/' + link


if __name__ == "__main__":
    collector = LinkCollector(sys.argv[1])
    collector.collect_links()
    for link in collector.collected_links:
        print(link)