I decided to pick up Python the other day and have already written a couple of scripts. The second one searches for a path of links leading from one web page to another. Please take a look and tell me what you would change style-wise, and what you would change with the interpreter's inner workings in mind.
Code:
#!/usr/bin/env python3
# Searches the shortest path of links between two web pages
# Usage: crawler.py search-depth url-from url-to
# Example: crawler.py 3 https://en.wikipedia.org/wiki/Python http://python.org
from sys import argv
from collections import deque
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse
import re
# Parse HTML, return links only
class LinkParser(HTMLParser):
    def handle_starttag(self, tag, attributes):
        if tag != 'a':
            return
        for key, value in attributes:
            if key == 'href':
                new_url = parse.urljoin(self.base_url, value)
                self.links.append(new_url)
                return

    def get_links(self, url):
        self.links = []
        self.base_url = url
        response = urlopen(url)
        content_type = response.getheader('Content-Type') or ''  # header may be absent
        if 'text/html' not in content_type:
            return []
        self.feed(response.read().decode('utf-8'))
        return self.links
# Strip http(s), www, trailing slash and everything after hash sign
# E.g. http://www.domain.com/#id -> domain.com
url_pattern = re.compile(r"^(?:https?://)?(?:www\.)?(.*?)/?(?:#.*)?$")
def url_hash(url):
    url = url_pattern.match(url.lower()).group(1)
    return hash(url)
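# For example, all of the following reduce to 'en.wikipedia.org/wiki/python'
# (note the lower-casing) and so deliberately share one hash:
#   https://en.wikipedia.org/wiki/Python
#   http://www.en.wikipedia.org/wiki/Python/
#   en.wikipedia.org/wiki/Python#History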
# Return the shortest path from *url* to *target* as a list of URLs, no longer than *depth*
def scan(depth, url, target):
    target_hash = url_hash(target)
    if url_hash(url) == target_hash:
        return [url]
    if depth < 2:
        return []  # with depth 1 only the start page itself can match, checked above
    path = [url]  # path of links from start to current
    queue = deque()  # queue of URLs to visit; each element contains the full path of links
    queue.append(path)
    visited = {url_hash(url)}  # avoid visiting a page twice
    parser = LinkParser()
    reached_bottom = False  # avoid adding new links below the maximum depth of search
    while queue:
        try:
            if len(queue[0]) == depth - 1:
                reached_bottom = True
            path = queue.popleft()
            url = path[-1]
            print(len(path), url + "...", end='', flush=True)  # current depth and URL
            for link in parser.get_links(url):
                link_hash = url_hash(link)
                if link_hash == target_hash:
                    print(" found")
                    return path + [link]
                if link_hash in visited:
                    continue
                visited.add(link_hash)
                if not reached_bottom:
                    queue.append(path + [link])
            print(" scanned")
        except Exception as e:
            print(" failed")
            print(e)
    return []
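# Quick sanity check from the REPL (network access assumed; this relies on
# python.org's front page still linking to /about/, so treat it as a sketch):
#   >>> scan(2, 'https://www.python.org/', 'https://www.python.org/about/')
#   1 https://www.python.org/... found
#   ['https://www.python.org/', 'https://www.python.org/about/']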
if __name__ == "__main__":
    path = None
    try:
        path = scan(int(argv[1]), argv[2], argv[3])
        print()
    except KeyboardInterrupt:
        print(" interrupted by user")
        exit()
    except (IndexError, ValueError):  # missing or non-numeric arguments
        print("Searches the shortest path of links between two web pages")
        print("Usage: crawler.py search-depth url-from url-to")
        print("Example: crawler.py 3 https://en.wikipedia.org/wiki/Python http://www.python.org/")
        exit()
    if not path:
        print("Path not found")
        exit(1)
    for i, page in enumerate(path, 1):
        print(i, page)
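P.S. If it helps the review: the parser can be exercised without the network by setting the fields that get_links() normally fills in and feeding it a static snippet, e.g.:

# Offline check of LinkParser: set the fields get_links() would normally
# set, then feed a hand-written HTML fragment.
parser = LinkParser()
parser.links = []
parser.base_url = 'https://example.com/docs/'
parser.feed('<a href="../about">About</a> <a name="x">no href</a>')
print(parser.links)  # expected: ['https://example.com/about']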