|
@@ -0,0 +1,40 @@
|
|
1
|
+from urllib.request import urlopen
|
|
2
|
+from urllib.parse import urlparse
|
|
3
|
+import re
|
|
4
|
+import sys
|
|
5
|
+
|
|
6
|
# Matches an anchor tag and captures the quoted href value.
LINK_REGEX = re.compile(r"""<a [^>]*href=['"]([^"]+)['"][^>]*>""")
|
|
7
|
+
|
|
8
|
+
|
|
9
|
class LinkCollector:
    """Recursively crawl a web site and collect every link it references.

    Only pages on the starting host are fetched; off-site links are
    recorded in ``collected_links`` but never followed.
    """

    def __init__(self, url):
        # Preserve the scheme the caller supplied (fall back to http)
        # instead of forcing every request onto plain http — the original
        # hardcoded "http://", which breaks https-only sites.
        parsed = urlparse(url)
        scheme = parsed.scheme or "http"
        self.url = scheme + "://" + parsed.netloc
        self.collected_links = set()  # every normalized link ever seen, any host
        self.visited_links = set()    # full URLs already fetched

    def collect_links(self, path="/"):
        """Fetch ``path`` on this site and recursively follow on-site links.

        Side effects: grows ``collected_links`` and ``visited_links``.
        ``unvisited_links`` is left holding the not-yet-visited links from
        the most recently processed page (kept for backward compatibility).
        """
        full_url = self.url + path
        self.visited_links.add(full_url)
        # BUG FIX: the original did str(urlopen(...).read()), which yields
        # the b'...' repr of the bytes; that escapes single quotes, so the
        # regex silently missed href='...' links.  Decode the body instead.
        # errors="replace" keeps best-effort behavior on odd encodings.
        page = urlopen(full_url).read().decode("utf-8", errors="replace")
        links = {self.normalize_url(path, link)
                 for link in LINK_REGEX.findall(page)}
        self.collected_links = links.union(self.collected_links)
        # Snapshot into a local before recursing: each recursive call
        # reassigns self.unvisited_links for its own page, and iterating
        # the attribute directly only worked by accident.
        unvisited = links.difference(self.visited_links)
        self.unvisited_links = unvisited
        for link in unvisited:
            # Only follow links that stay on the starting host.
            if link.startswith(self.url):
                self.collect_links(urlparse(link).path)

    def normalize_url(self, path, link):
        """Return *link* as an absolute URL.

        ``path`` is the path of the page the link appeared on; it is used
        to resolve page-relative links.
        """
        if link.startswith(("http://", "https://")):
            return link                 # already absolute
        elif link.startswith("/"):
            return self.url + link      # site-absolute path
        else:
            # Page-relative: resolve against the directory of `path`.
            return self.url + path.rpartition('/')[0] + '/' + link
|
|
35
|
+
|
|
36
|
if __name__ == "__main__":
    # Entry point: crawl the site named by the first command-line
    # argument and print every link discovered, one per line.
    site_collector = LinkCollector(sys.argv[1])
    site_collector.collect_links()
    for collected in site_collector.collected_links:
        print(collected)
|