Browse Source

added case study of chapter 6

T. Meissner 5 years ago
parent
commit
2e19992f15
1 changed file with 40 additions and 0 deletions
  1. 40
    0
      python_3_oop/chapter06/linkcollector.py

+ 40
- 0
python_3_oop/chapter06/linkcollector.py View File

@@ -0,0 +1,40 @@
1
+from urllib.request import urlopen
2
+from urllib.parse import urlparse
3
+import re
4
+import sys
5
+
6
# Matches an opening <a ...> tag and captures the value of its href attribute.
# The value must be closed by the same quote character that opened it: the
# look-behind inside the group checks which quote was consumed and the
# matching alternative then stops at the next quote of that kind. Exactly one
# capturing group is used so that findall() returns plain strings.
LINK_REGEX = re.compile(
    r"""<a [^>]*href=['"]((?<=")[^"]+(?=")|(?<=')[^']+(?='))['"][^>]*>"""
)
7
+
8
+
9
class LinkCollector:
    """Collect the links found on the pages of a single web site.

    Starting from one page, every page of the same site that can be reached
    through ``<a href=...>`` tags is fetched. Afterwards ``collected_links``
    holds every link target that was seen (internal and external) and
    ``visited_links`` holds the URLs of the pages that were fetched.
    """

    def __init__(self, url):
        """Derive the site root from *url* and start with empty link sets.

        Only the network location (host and optional port) of *url* is kept;
        the site is always addressed as ``http://<netloc>``.
        """
        netloc = urlparse(url).netloc
        if not netloc:
            # Without a scheme ("example.com/page") urlparse() does not
            # recognise a network location; re-parse the value as a
            # network-path reference so the host is still picked up.
            netloc = urlparse("//" + url).netloc
        self.url = "http://" + netloc
        self.collected_links = set()
        self.visited_links = set()
        # Links of the most recently processed page that were not visited
        # at that moment; refreshed by collect_links() for every page.
        self.unvisited_links = set()

    def collect_links(self, path="/"):
        """Crawl the site starting at *path* and record every link found.

        The page at ``self.url + path`` is fetched, its links are normalized
        and added to ``collected_links``, and every link that points into
        this site is fetched in turn. Pages are identified by their path
        only (query string and fragment are ignored when following a link),
        and each page is fetched at most once per call.

        Errors raised by ``urlopen()`` are not caught and abort the crawl.
        """
        # An explicit work list instead of recursion: the crawl depth is
        # then not limited by the interpreter's recursion limit.
        pending = [path]
        scheduled = {self.url + path}
        while pending:
            current_path = pending.pop()
            full_url = self.url + current_path
            self.visited_links.add(full_url)

            page = self._fetch(full_url)
            links = {
                self.normalize_url(current_path, link)
                for link in LINK_REGEX.findall(page)
            }
            self.collected_links.update(links)
            self.unvisited_links = links.difference(self.visited_links)

            for link in self.unvisited_links:
                if not self._is_internal(link):
                    continue
                # "http://host" and "http://host/" are the same page.
                next_path = urlparse(link).path or "/"
                target = self.url + next_path
                # Compare the URL that would actually be fetched, not the
                # raw link: links differing only in query or fragment map
                # to the same fetch and must not be scheduled again.
                if target in self.visited_links or target in scheduled:
                    continue
                scheduled.add(target)
                pending.append(next_path)

    def normalize_url(self, path, link):
        """Return *link* as an absolute URL.

        *link* is resolved relative to the page at ``self.url + path``:
        absolute URLs are returned as they are, links starting with ``/``
        are resolved against the site root, and other relative links
        (including ``../`` segments) against the directory of *path*.
        A link that cannot be parsed is returned unchanged.
        """
        from urllib.parse import urljoin

        try:
            return urljoin(self.url + path, link)
        except ValueError:
            # urljoin() rejects some malformed URLs; keep the raw link
            # rather than aborting the whole crawl.
            return link

    def _is_internal(self, link):
        """Return True if *link* has exactly this site's scheme and host."""
        try:
            parsed = urlparse(link)
        except ValueError:
            return False
        # A plain prefix test would also accept "http://host.evil.org" for
        # the site "http://host"; compare the complete origin instead.
        return parsed.scheme + "://" + parsed.netloc == self.url

    def _fetch(self, full_url):
        """Download *full_url* and return the response body as text."""
        # The context manager closes the connection even if read() fails.
        with urlopen(full_url) as response:
            raw = response.read()
            charset = response.headers.get_content_charset() or "utf-8"
        try:
            return raw.decode(charset, errors="replace")
        except LookupError:
            # The server announced an encoding Python does not know.
            return raw.decode("utf-8", errors="replace")
35
+
36
if __name__ == "__main__":
    # The start URL is the first command-line argument; without it there is
    # nothing to crawl, so exit with a usage message instead of a traceback.
    if len(sys.argv) < 2:
        sys.exit("usage: {} <url>".format(sys.argv[0]))
    collector = LinkCollector(sys.argv[1])
    collector.collect_links()
    # collected_links is a set; sort it so repeated runs print the links
    # in the same order.
    for link in sorted(collector.collected_links):
        print(link)