I decided to pick up Python the other day and have already written a couple of scripts. The second one searches for a path of links leading from one web page to another. Please take a look and tell me what you would change style-wise, and what you would change with the interpreter's inner workings in mind.
Code:
#!/usr/bin/env python3
# Searches the shortest path of links between two web pages
# Usage: crawler.py search-depth url-from url-to
# Example: crawler.py 3 https://en.wikipedia.org/wiki/Python http://python.org
from sys import argv
from collections import deque
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse
import re
# Parse HTML, return links only
class LinkParser(HTMLParser):
    def handle_starttag(self, tag, attributes):
        if tag != 'a':
            return
        for key, value in attributes:
            if key == 'href':
                new_url = parse.urljoin(self.base_url, value)
                self.links.append(new_url)
                return

    def get_links(self, url):
        self.links = []
        self.base_url = url
        response = urlopen(url)
        content_type = response.getheader('Content-Type') or ''  # header may be absent
        if 'text/html' not in content_type:
            return []
        self.feed(response.read().decode('utf-8'))
        return self.links
# Strip http(s), www, trailing slash and everything after hash sign
# E.g. http://www.domain.com/#id -> domain.com
url_pattern = re.compile(r"^(?:https?://)?(?:www\.)?(.*?)/?(?:#.*)?$")
def url_hash(url):
    url = url_pattern.match(url.lower()).group(1)
    return hash(url)
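# For example, all of the following reduce to 'en.wikipedia.org/wiki/python'
# (note the lower-casing) and so deliberately share one hash:
#   https://en.wikipedia.org/wiki/Python
#   http://www.en.wikipedia.org/wiki/Python/
#   en.wikipedia.org/wiki/Python#History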
# Return the shortest path from *url* to *target* as a list of URLs, no longer than *depth*
def scan(depth, url, target):
    target_hash = url_hash(target)
    if url_hash(url) == target_hash:
        return [url]
    if depth < 2:
        return []  # with depth 1 only the start page itself can match, checked above
    path = [url]  # path of links from start to current
    queue = deque()  # queue of URLs to visit; each element contains the full path of links
    queue.append(path)
    visited = {url_hash(url)}  # avoid visiting a page twice
    parser = LinkParser()
    reached_bottom = False  # avoid adding new links below the maximum depth of search
    while queue:
        try:
            if len(queue[0]) == depth - 1:
                reached_bottom = True
            path = queue.popleft()
            url = path[-1]
            print(len(path), url + "...", end='', flush=True)  # current depth and URL
            for link in parser.get_links(url):
                link_hash = url_hash(link)
                if link_hash == target_hash:
                    print(" found")
                    return path + [link]
                if link_hash in visited:
                    continue
                visited.add(link_hash)
                if not reached_bottom:
                    queue.append(path + [link])
            print(" scanned")
        except Exception as e:
            print(" failed")
            print(e)
    return []
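# Quick sanity check from the REPL (network access assumed; this relies on
# python.org's front page still linking to /about/, so treat it as a sketch):
#   >>> scan(2, 'https://www.python.org/', 'https://www.python.org/about/')
#   1 https://www.python.org/... found
#   ['https://www.python.org/', 'https://www.python.org/about/']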
if __name__ == "__main__":
    path = None
    try:
        path = scan(int(argv[1]), argv[2], argv[3])
        print()
    except KeyboardInterrupt:
        print(" interrupted by user")
        exit()
    except (IndexError, ValueError):  # missing or non-numeric arguments
        print("Searches the shortest path of links between two web pages")
        print("Usage: crawler.py search-depth url-from url-to")
        print("Example: crawler.py 3 https://en.wikipedia.org/wiki/Python http://www.python.org/")
        exit()
    if not path:
        print("Path not found")
        exit(1)
    for i, page in enumerate(path, 1):
        print(i, page)
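P.S. If it helps the review: the parser can be exercised without the network by setting the fields that get_links() normally fills in and feeding it a static snippet, e.g.:

# Offline check of LinkParser: set the fields get_links() would normally
# set, then feed a hand-written HTML fragment.
parser = LinkParser()
parser.links = []
parser.base_url = 'https://example.com/docs/'
parser.feed('<a href="../about">About</a> <a name="x">no href</a>')
print(parser.links)  # expected: ['https://example.com/about']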