|
@ -0,0 +1,40 @@ |
|
|
|
|
|
from urllib.request import urlopen |
|
|
|
|
|
from urllib.parse import urlparse |
|
|
|
|
|
import re |
|
|
|
|
|
import sys |
|
|
|
|
|
|
|
|
|
|
|
LINK_REGEX = re.compile("<a [^>]*href=['\"]([^\"]+)['\"][^>]*>") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class LinkCollector: |
|
|
|
|
|
|
|
|
|
|
|
def __init__(self, url): |
|
|
|
|
|
self.url = "http://" + urlparse(url).netloc |
|
|
|
|
|
self.collected_links = set() |
|
|
|
|
|
self.visited_links = set() |
|
|
|
|
|
|
|
|
|
|
|
def collect_links(self, path="/"): |
|
|
|
|
|
full_url = self.url + path |
|
|
|
|
|
self.visited_links.add(full_url) |
|
|
|
|
|
page = str(urlopen(full_url).read()) |
|
|
|
|
|
links = LINK_REGEX.findall(page) |
|
|
|
|
|
links = {self.normalize_url(path, link) for link in links} |
|
|
|
|
|
self.collected_links = links.union(self.collected_links) |
|
|
|
|
|
self.unvisited_links = links.difference(self.visited_links) |
|
|
|
|
|
for link in self.unvisited_links: |
|
|
|
|
|
if link.startswith(self.url): |
|
|
|
|
|
self.collect_links(urlparse(link).path) |
|
|
|
|
|
|
|
|
|
|
|
def normalize_url(self, path, link): |
|
|
|
|
|
if link.startswith("http://") or link.startswith("https://"): |
|
|
|
|
|
return link |
|
|
|
|
|
elif link.startswith("/"): |
|
|
|
|
|
return self.url + link |
|
|
|
|
|
else: |
|
|
|
|
|
return self.url + path.rpartition('/')[0] + '/' + link |
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
|
|
collector = LinkCollector(sys.argv[1]) |
|
|
|
|
|
collector.collect_links() |
|
|
|
|
|
for link in collector.collected_links: |
|
|
|
|
|
print(link) |