| @ -0,0 +1,40 @@ | |||||
import re
import sys
from urllib.parse import urlparse
from urllib.request import urlopen

# Matches an <a ...> tag and captures the href value.  Accepts either
# quote style around the URL; attributes may precede or follow href.
LINK_REGEX = re.compile(r"<a [^>]*href=['\"]([^\"]+)['\"][^>]*>")
class LinkCollector:
    """Recursively crawl a site and collect every link it can find.

    Only pages on the starting host are fetched; off-site links are
    recorded in ``collected_links`` but never followed.
    """

    def __init__(self, url):
        """Store the site root derived from *url* and initialize link sets.

        The scheme is preserved when present (falling back to ``http``) so
        that ``https`` start URLs are crawled over ``https`` instead of
        being silently rewritten to ``http``.
        """
        parts = urlparse(url)
        self.url = (parts.scheme or "http") + "://" + parts.netloc
        self.collected_links = set()  # every link seen, on-site or not
        self.visited_links = set()    # full URLs already fetched

    def collect_links(self, path="/"):
        """Fetch ``self.url + path``, record its links, and recurse.

        Each page is fetched at most once; only links that start with the
        site root are followed.
        """
        full_url = self.url + path
        # Guard against re-fetching a page reachable via multiple paths:
        # the per-call unvisited snapshot below goes stale during recursion.
        if full_url in self.visited_links:
            return
        self.visited_links.add(full_url)
        # BUG FIX: decode the response bytes.  str(b"...") yields "b'...'"
        # with backslash-escaped quotes, which breaks the href regex for
        # single-quoted attributes.
        page = urlopen(full_url).read().decode("utf-8", errors="replace")
        links = {self.normalize_url(path, link)
                 for link in LINK_REGEX.findall(page)}
        self.collected_links = links.union(self.collected_links)
        # Kept as an instance attribute for backward compatibility with
        # callers that inspect it after a crawl.
        self.unvisited_links = links.difference(self.visited_links)
        for link in self.unvisited_links:
            if link.startswith(self.url):  # stay on the starting site
                self.collect_links(urlparse(link).path)

    def normalize_url(self, path, link):
        """Return *link* as an absolute URL.

        Absolute links pass through unchanged; host-relative links
        (``/x``) are joined to the site root; page-relative links are
        joined to the directory portion of *path*.
        """
        if link.startswith("http://") or link.startswith("https://"):
            return link
        elif link.startswith("/"):
            return self.url + link
        else:
            return self.url + path.rpartition('/')[0] + '/' + link
| if __name__ == "__main__": | |||||
| collector = LinkCollector(sys.argv[1]) | |||||
| collector.collect_links() | |||||
| for link in collector.collected_links: | |||||
| print(link) | |||||