From 2e19992f15eb23de912d5481512c3502858daae2 Mon Sep 17 00:00:00 2001
From: tmeissner <programming@goodcleanfun.de>
Date: Wed, 31 Dec 2014 11:28:14 +0100
Subject: [PATCH] added case study of chapter 6

---
 python_3_oop/chapter06/linkcollector.py | 126 ++++++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 python_3_oop/chapter06/linkcollector.py
diff --git a/python_3_oop/chapter06/linkcollector.py b/python_3_oop/chapter06/linkcollector.py
new file mode 100644
index 0000000..1e3dddf
--- /dev/null
+++ b/python_3_oop/chapter06/linkcollector.py
@@ -0,0 +1,126 @@
+"""Collect every link reachable from the start page of a web site.
+
+Usage: python linkcollector.py <url>
+"""
+from urllib.request import urlopen
+from urllib.parse import urlparse
+import re
+import sys
+
+# Matches an <a ...> tag and captures its href value. The value may be
+# wrapped in either quote style, so both quote characters end the capture.
+LINK_REGEX = re.compile(r"""<a [^>]*href=['"]([^'"]+)['"][^>]*>""")
+
+
+class LinkCollector:
+    """Crawl one site over HTTP and record every link found on its pages.
+
+    Attributes:
+        url: Site root: "http://" followed by the host of the given URL.
+        collected_links: Every normalized link seen so far.
+        visited_links: URLs of the pages requested so far.
+        unvisited_links: Links of the most recently parsed page that had
+            not been requested when that page was parsed.
+    """
+
+    def __init__(self, url):
+        """Derive the site root from url and start with empty link sets.
+
+        Args:
+            url: Any URL on the site; only its host part is kept. A bare
+                host such as "example.com" is accepted as well.
+        """
+        netloc = urlparse(url).netloc
+        if not netloc:
+            # Without a scheme urlparse() files the host under the path;
+            # a leading "//" makes it parse as the network location.
+            netloc = urlparse("//" + url).netloc
+        self.url = "http://" + netloc
+        self.collected_links = set()
+        self.visited_links = set()
+        self.unvisited_links = set()
+
+    def collect_links(self, path="/"):
+        """Fetch path and every same-site page reachable from it.
+
+        Links found are added to collected_links and requested pages to
+        visited_links. Errors raised by urlopen() propagate to the caller.
+
+        Args:
+            path: Path on the site where the crawl starts.
+        """
+        # An explicit stack instead of recursion, so that a large site
+        # cannot exhaust the interpreter's recursion limit.
+        queued = {path}
+        pending = [path]
+        while pending:
+            current = pending.pop()
+            full_url = self.url + current
+            self.visited_links.add(full_url)
+            page = self._fetch(full_url)
+            links = {self.normalize_url(current, link)
+                     for link in LINK_REGEX.findall(page)}
+            self.collected_links = links.union(self.collected_links)
+            self.unvisited_links = links.difference(self.visited_links)
+            for link in self.unvisited_links:
+                target = self._internal_path(link)
+                if target is None or target in queued:
+                    continue
+                # target has lost any query string or fragment, so test the
+                # URL that would really be requested; otherwise a "#anchor"
+                # link makes a page schedule itself again without end.
+                if self.url + target in self.visited_links:
+                    continue
+                queued.add(target)
+                pending.append(target)
+
+    def normalize_url(self, path, link):
+        """Turn a link found on the page at path into an absolute URL.
+
+        Args:
+            path: Site-relative path of the page that contains the link.
+            link: The href value as written in the page.
+
+        Returns:
+            link unchanged if it already is an http(s) URL, the site root
+            plus link if it starts with "/", otherwise link appended to
+            the directory part of path.
+        """
+        if link.startswith(("http://", "https://")):
+            return link
+        if link.startswith("/"):
+            return self.url + link
+        return self.url + path.rpartition("/")[0] + "/" + link
+
+    def _fetch(self, full_url):
+        """Download full_url and return its body as text."""
+        with urlopen(full_url) as response:
+            charset = response.headers.get_content_charset() or "utf-8"
+            body = response.read()
+        try:
+            # str(bytes) would yield the "b'...'" repr, which may carry
+            # backslash-escaped quotes that hide hrefs from LINK_REGEX.
+            return body.decode(charset, errors="replace")
+        except LookupError:
+            # The server announced a codec Python does not know.
+            return body.decode("utf-8", errors="replace")
+
+    def _internal_path(self, link):
+        """Return the path of link if it is on this site, else None."""
+        site = urlparse(self.url)
+        parsed = urlparse(link)
+        # Compare scheme and host exactly: a prefix test would also accept
+        # "http://example.com.evil.org" for the site "http://example.com".
+        if (parsed.scheme, parsed.netloc) != (site.scheme, site.netloc):
+            return None
+        return parsed.path or "/"
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        sys.exit("usage: python linkcollector.py <url>")
+    collector = LinkCollector(sys.argv[1])
+    collector.collect_links()
+    # Sets are unordered; sort so the output is reproducible.
+    for link in sorted(collector.collected_links):
+        print(link)