system-design-primer/solutions/system_design/web_crawler/web_crawler_mapreduce.py

26 lines
501 B
Python
Raw Normal View History

2017-03-05 08:06:58 +03:00
# -*- coding: utf-8 -*-
from mrjob.job import MRJob
from mrjob.step import MRStep
2021-03-14 13:08:05 +03:00
class RemoveDuplicateUrls(MRJob):
    """MapReduce job that keeps only input lines seen exactly once.

    Each input line is emitted with a count of 1; the counts are summed per
    line and a line is written to the output only when its total is 1.
    Lines that occur more than once are dropped entirely rather than being
    collapsed to a single copy.
    """

    def mapper(self, _, line):
        """Emit the input line as the key with an occurrence count of 1.

        Args:
            _: Input key; not used.
            line: One line of input (presumably a URL; confirm against the
                job's input data).

        Yields:
            A ``(line, 1)`` pair.
        """
        yield line, 1

    def reducer(self, key, values):
        """Emit the key only when its summed count is exactly 1.

        Args:
            key: A line emitted by ``mapper``.
            values: Iterable of the counts emitted for ``key``.

        Yields:
            ``(key, total)`` when ``total == 1``; nothing otherwise.
        """
        total = sum(values)
        if total == 1:
            yield key, total

    def steps(self):
        """Run the map and reduce steps.

        Returns:
            A one-element list holding the step that wires ``mapper`` to
            ``reducer``.
        """
        # MRStep is used instead of the deprecated ``self.mr(...)`` helper,
        # which newer mrjob releases no longer provide.
        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer)
        ]
if __name__ == '__main__':
    # Launch the job only when this file is executed as a script.
    RemoveDuplicateUrls.run()