from urllib.request import urlopen
from urllib.parse import urlparse
import re
import sys

# Match anchor tags and capture the href value. The capture group
# excludes both quote characters so a single-quoted href cannot run
# past its closing quote.
LINK_REGEX = re.compile("<a [^>]*href=['\"]([^'\"]+)['\"][^>]*>")
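
# Illustrative match, on hypothetical markup: the pattern captures
# "/about.html" from
#
#     <a class="nav" href="/about.html">About</a>
#
# A regex is a blunt instrument for HTML, but it is good enough for
# the simple pages this script targets.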


class LinkCollector:
    def __init__(self, url):
        # Keep only the scheme and host of the seed URL; relative links
        # are resolved against this base. Preserving the scheme avoids
        # downgrading an https seed to http.
        parsed = urlparse(url)
        self.url = (parsed.scheme or "http") + "://" + parsed.netloc
        self.collected_links = set()
        self.visited_links = set()

    def collect_links(self, path="/"):
        full_url = self.url + path
        # Mark the page as visited before fetching so pages that link
        # to each other cannot recurse forever.
        self.visited_links.add(full_url)
        # Decode the response bytes; calling str() on bytes would yield
        # the b'...' repr rather than the page text.
        page = urlopen(full_url).read().decode("utf-8", errors="replace")
        links = LINK_REGEX.findall(page)
        links = {self.normalize_url(path, link) for link in links}
        self.collected_links = links.union(self.collected_links)
        # A local variable, not an instance attribute: the recursive
        # calls below would overwrite a shared attribute mid-iteration.
        unvisited_links = links.difference(self.visited_links)
        for link in unvisited_links:
            # Follow only links that stay on the starting host.
            if link.startswith(self.url):
                self.collect_links(urlparse(link).path)
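
    # Hypothetical crawl trace: for a site whose root page links to
    # /a.html and /b.html, collect_links("/") fetches the root, records
    # both links, then recurses into each; any page already present in
    # visited_links is skipped, so cycles terminate.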

    def normalize_url(self, path, link):
        # Absolute URLs pass through unchanged.
        if link.startswith("http://") or link.startswith("https://"):
            return link
        # Host-relative paths resolve against the site root.
        elif link.startswith("/"):
            return self.url + link
        # Page-relative paths resolve against the current page's directory.
        else:
            return self.url + path.rpartition('/')[0] + '/' + link
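
# Illustrative behavior of normalize_url for a collector rooted at
# http://example.com (inputs assumed for demonstration):
#
#     normalize_url("/a/b.html", "http://other.com/x") -> "http://other.com/x"
#     normalize_url("/a/b.html", "/top.html")          -> "http://example.com/top.html"
#     normalize_url("/a/b.html", "c.html")             -> "http://example.com/a/c.html"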

if __name__ == "__main__":
    collector = LinkCollector(sys.argv[1])
    collector.collect_links()
    for link in collector.collected_links:
        print(link)
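
# Usage sketch (the filename link_collector.py and the target site are
# assumptions for illustration):
#
#     python3 link_collector.py http://example.com
#
# The script prints every unique link discovered while crawling pages
# on the seed URL's host.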