mirror of
https://github.com/donnemartin/system-design-primer.git
synced 2025-08-09 04:12:39 +03:00
Add Web Crawler solution
This commit is contained in:
25
solutions/system_design/web_crawler/web_crawler_mapreduce.py
Normal file
25
solutions/system_design/web_crawler/web_crawler_mapreduce.py
Normal file
@@ -0,0 +1,25 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from mrjob.job import MRJob
from mrjob.step import MRStep
|
||||
|
||||
|
||||
class RemoveDuplicateUrls(MRJob):
    """MapReduce job that keeps only the input lines seen exactly once.

    Every input line is counted; a line is written to the output only when
    its total count is 1. Lines that occur more than once are dropped
    entirely (they are not collapsed to a single copy).

    NOTE(review): the class name suggests each line is a URL -- confirm
    against the job's input.
    """

    def mapper(self, _, line):
        """Emit ``(line, 1)`` so identical lines are grouped for the reducer.

        Args:
            _: Input key, ignored.
            line: One line of the job's input.

        Yields:
            A ``(line, 1)`` pair.
        """
        yield line, 1

    def reducer(self, key, values):
        """Emit ``key`` only if it occurred exactly once in the input.

        Args:
            key: A line emitted by the mapper.
            values: Iterable of the counts emitted for ``key``.

        Yields:
            ``(key, 1)`` when the summed count equals 1; nothing otherwise.
        """
        total = sum(values)
        if total == 1:
            yield key, total

    def steps(self):
        """Run the map and reduce steps."""
        # MRStep is the supported way to declare a step; the older
        # MRJob.mr() helper was deprecated and later removed from mrjob.
        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer),
        ]
|
||||
|
||||
|
||||
# Launch the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    RemoveDuplicateUrls.run()
|
Reference in New Issue
Block a user