system-design-primer/solutions/system_design/web_crawler/web_crawler_snippets.py

74 lines
2.2 KiB
Python
Raw Normal View History

2017-03-05 08:06:58 +03:00
# -*- coding: utf-8 -*-
2018-07-19 09:09:09 +03:00
2021-03-14 13:08:05 +03:00
class PagesDataStore(object):
    """Abstraction over the database holding the crawl frontier.

    Manages two logical collections: `links_to_crawl` (a priority queue of
    URLs still to visit) and `crawled_links` (signatures of pages already
    processed, used for duplicate detection).

    The methods are intentionally left as stubs in this design snippet.
    """

    def __init__(self, db):
        """Keep a handle to the underlying database client.

        :param db: database connection/client used by all operations.
        """
        # NOTE: the original had a stray `pass` after this assignment; a
        # `pass` following real statements is dead code and was removed.
        self.db = db

    def add_link_to_crawl(self, url):
        """Add the given link to `links_to_crawl`."""
        pass

    def remove_link_to_crawl(self, url):
        """Remove the given link from `links_to_crawl`."""
        pass

    def reduce_priority_link_to_crawl(self, url):
        """Reduce the priority of a link in `links_to_crawl` to avoid cycles."""
        pass

    def extract_max_priority_page(self):
        """Return the highest priority link in `links_to_crawl`.

        Expected to remove the returned page from the frontier and to
        return None when the frontier is empty (see Crawler.crawl).
        """
        pass

    def insert_crawled_link(self, url, signature):
        """Add the given link to `crawled_links`."""
        pass

    def crawled_similar(self, signature):
        """Determine if we've already crawled a page matching the given signature."""
        pass
2017-03-05 08:06:58 +03:00
2021-03-14 13:08:05 +03:00
class Page(object):
    """A fetched web page: its URL, raw contents, and outbound child links.

    A dedup signature is computed eagerly at construction time so the
    crawler can cheaply test for near-duplicate pages.
    """

    def __init__(self, url, contents, child_urls):
        """Store the fetched page data and derive its signature.

        :param url: address the page was fetched from.
        :param contents: raw page body.
        :param child_urls: outbound links discovered in the page.
        """
        self.url, self.contents, self.child_urls = url, contents, child_urls
        self.signature = self.create_signature()

    def create_signature(self):
        """Build a dedup signature from the url and contents (stub)."""
        pass
2017-03-05 08:06:58 +03:00
2021-03-14 13:08:05 +03:00
class Crawler(object):
    """Drives the crawl loop: pulls pages from the frontier, indexes them,
    and feeds newly discovered links back into the data store."""

    def __init__(self, pages, data_store, reverse_index_queue, doc_index_queue):
        """Wire up the crawler's collaborators.

        :param pages: source of pages to crawl.
        :param data_store: PagesDataStore managing the crawl frontier.
        :param reverse_index_queue: queue feeding the reverse-index service.
        :param doc_index_queue: queue feeding the document-index service.
        """
        self.pages = pages
        self.data_store = data_store
        self.reverse_index_queue = reverse_index_queue
        self.doc_index_queue = doc_index_queue

    def crawl_page(self, page):
        """Index a single page and record it as crawled.

        Enqueues every child link for future crawling, hands the page to
        both index services, then moves the page's own URL from
        `links_to_crawl` to `crawled_links`.
        """
        for url in page.child_urls:
            self.data_store.add_link_to_crawl(url)
        self.reverse_index_queue.generate(page)
        self.doc_index_queue.generate(page)
        self.data_store.remove_link_to_crawl(page.url)
        self.data_store.insert_crawled_link(page.url, page.signature)

    def crawl(self):
        """Crawl until the frontier is exhausted.

        Pages whose signature matches an already-crawled page are demoted
        (cycle avoidance) instead of re-indexed.
        """
        while True:
            page = self.data_store.extract_max_priority_page()
            if page is None:
                break
            if self.data_store.crawled_similar(page.signature):
                self.data_store.reduce_priority_link_to_crawl(page.url)
            else:
                self.crawl_page(page)
            # BUG FIX: the original ended the loop body with a second
            # `extract_max_priority_page()` call whose result was thrown
            # away (immediately overwritten at the top of the loop) —
            # silently discarding every other page from the frontier.