Browse Source

Added case study of chapter 6.

master
T. Meissner 10 years ago
parent
commit
2e19992f15
1 changed files with 40 additions and 0 deletions
  1. +40
    -0
      python_3_oop/chapter06/linkcollector.py

+ 40
- 0
python_3_oop/chapter06/linkcollector.py View File

@ -0,0 +1,40 @@
from urllib.request import urlopen
from urllib.parse import urlparse
import re
import sys
# Matches the href attribute of an anchor tag and captures its value.
# The capture class excludes BOTH quote characters so the greedy match
# cannot run past a single-quoted value's closing quote into later
# attributes (e.g. <a href='a' target='b'> must capture "a", not
# "a' target='b"). Raw string avoids double-escaping the quotes.
LINK_REGEX = re.compile(r"<a [^>]*href=['\"]([^'\"]+)['\"][^>]*>")
class LinkCollector:
    """Crawl a website and collect every hyperlink reachable from it.

    Only pages on the starting host are followed; off-site links are
    recorded in ``collected_links`` but never downloaded.
    """

    def __init__(self, url):
        # Keep only scheme + host; individual pages are addressed by
        # path in collect_links().
        self.url = "http://" + urlparse(url).netloc
        self.collected_links = set()  # every link seen, on- or off-site
        self.visited_links = set()    # full URLs already downloaded

    def collect_links(self, path="/"):
        """Download *path*, record its links, and recurse into any
        unvisited same-site pages.
        """
        full_url = self.url + path
        self.visited_links.add(full_url)
        # Decode the response body instead of calling str() on the bytes
        # object — str(b"...") yields the "b'...'" repr with backslash
        # escapes, which corrupts the page text being searched.
        page = urlopen(full_url).read().decode("utf-8", errors="replace")
        links = LINK_REGEX.findall(page)
        links = {self.normalize_url(path, link) for link in links}
        self.collected_links = links.union(self.collected_links)
        # Iterate a local set: recursive collect_links() calls reassign
        # self.unvisited_links, so looping over the attribute directly is
        # fragile. The attribute is still set for backward compatibility.
        unvisited = links.difference(self.visited_links)
        self.unvisited_links = unvisited
        for link in unvisited:
            # Follow only links that stay on the starting host.
            if link.startswith(self.url):
                self.collect_links(urlparse(link).path)

    def normalize_url(self, path, link):
        """Return *link* as a fully qualified URL.

        Handles three forms: already-absolute (returned unchanged),
        host-relative (starts with "/"), and page-relative (resolved
        against the directory portion of *path*).
        """
        if link.startswith("http://") or link.startswith("https://"):
            return link
        elif link.startswith("/"):
            return self.url + link
        else:
            # Page-relative: drop the filename from *path*, keep the dir.
            return self.url + path.rpartition('/')[0] + '/' + link
def _main(argv):
    """Crawl the site named by argv[1] and print every collected link."""
    crawler = LinkCollector(argv[1])
    crawler.collect_links()
    for found_link in crawler.collected_links:
        print(found_link)


if __name__ == "__main__":
    _main(sys.argv)

Loading…
Cancel
Save