# -*- coding: utf-8 -*-

import hashlib


class PagesDataStore(object):

    def __init__(self, db):
        self.db = db

    def add_link_to_crawl(self, url):
        """Add the given link to `links_to_crawl`."""
        pass

    def remove_link_to_crawl(self, url):
        """Remove the given link from `links_to_crawl`."""
        pass

    def reduce_priority_link_to_crawl(self, url):
        """Reduce the priority of a link in `links_to_crawl` to avoid cycles."""
        pass

    def extract_max_priority_page(self):
        """Return the highest priority link in `links_to_crawl`."""
        pass

    def insert_crawled_link(self, url, signature):
        """Add the given link to `crawled_links`."""
        pass

    def crawled_similar(self, signature):
        """Determine if we've already crawled a page matching the given signature."""
        pass


class Page(object):

    def __init__(self, url, contents, child_urls):
        self.url = url
        self.contents = contents
        self.child_urls = child_urls
        self.signature = self.create_signature()

    def create_signature(self):
        # Create a signature based on the url and contents.  A plain content
        # hash is used here for simplicity; a production crawler would likely
        # use a similarity hash (e.g. simhash) so near-duplicate pages match.
        data = (self.url + self.contents).encode('utf-8')
        return hashlib.md5(data).hexdigest()


class Crawler(object):

    def __init__(self, pages, data_store, reverse_index_queue, doc_index_queue):
        self.pages = pages
        self.data_store = data_store
        self.reverse_index_queue = reverse_index_queue
        self.doc_index_queue = doc_index_queue

    def crawl_page(self, page):
        # Queue the page's children for crawling, hand the page off to the
        # indexing pipelines, then mark the page itself as crawled.
        for url in page.child_urls:
            self.data_store.add_link_to_crawl(url)
        self.reverse_index_queue.generate(page)
        self.doc_index_queue.generate(page)
        self.data_store.remove_link_to_crawl(page.url)
        self.data_store.insert_crawled_link(page.url, page.signature)

    def crawl(self):
        # Note: the loop re-extracts at the top of each iteration, so no
        # trailing extract call is needed (one would silently discard a page).
        while True:
            page = self.data_store.extract_max_priority_page()
            if page is None:
                break
            if self.data_store.crawled_similar(page.signature):
                # Duplicate content: demote the link instead of re-crawling,
                # which avoids crawler traps and cycles.
                self.data_store.reduce_priority_link_to_crawl(page.url)
            else:
                self.crawl_page(page)
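

# --- Illustrative sketch (not part of the design above) ---------------------
# `PagesDataStore` above is an interface against an external store.  The
# in-memory stand-in below is a minimal sketch of its semantics, using a
# binary heap with lazy deletion for the priority queue and a set for
# signature-based dedupe.  The names `InMemoryPagesDataStore`, `_heap`,
# `_priorities`, and the `priority` parameter are assumptions introduced
# here for illustration only.
import heapq


class InMemoryPagesDataStore(PagesDataStore):

    def __init__(self):
        super(InMemoryPagesDataStore, self).__init__(db=None)
        self._heap = []                   # entries: (negated priority, url)
        self._priorities = {}             # url -> current priority
        self._crawled_signatures = set()  # signatures in `crawled_links`

    def add_link_to_crawl(self, url, priority=1):
        """Add the given link to `links_to_crawl` (known links are skipped)."""
        if url not in self._priorities:
            self._priorities[url] = priority
            heapq.heappush(self._heap, (-priority, url))

    def remove_link_to_crawl(self, url):
        """Remove the given link from `links_to_crawl`.

        Lazy deletion: stale heap entries are skipped on extraction.
        """
        self._priorities.pop(url, None)

    def reduce_priority_link_to_crawl(self, url):
        """Reduce the priority of a link in `links_to_crawl` to avoid cycles."""
        if url in self._priorities:
            self._priorities[url] -= 1
            heapq.heappush(self._heap, (-self._priorities[url], url))

    def extract_max_priority_page(self):
        """Return the highest priority link in `links_to_crawl`, or None.

        The full design would fetch the url here and return a `Page` built
        from the response; fetching is out of scope for this sketch, so the
        url itself is returned.
        """
        while self._heap:
            neg_priority, url = heapq.heappop(self._heap)
            if self._priorities.get(url) == -neg_priority:
                return url
        return None

    def insert_crawled_link(self, url, signature):
        """Add the given link to `crawled_links`."""
        self._crawled_signatures.add(signature)

    def crawled_similar(self, signature):
        """Determine if we've already crawled a page matching the given signature."""
        return signature in self._crawled_signatures


if __name__ == '__main__':
    # Smoke test for the sketch: higher priority links come out first, and
    # signature-based dedupe reports previously crawled content.
    store = InMemoryPagesDataStore()
    store.add_link_to_crawl('https://example.com', priority=5)
    store.add_link_to_crawl('https://example.com/about')
    assert store.extract_max_priority_page() == 'https://example.com'
    store.insert_crawled_link('https://example.com', signature='abc123')
    assert store.crawled_similar('abc123')
    assert not store.crawled_similar('deadbeef')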