Add Web Crawler solution

This commit is contained in:
Donne Martin
2017-03-04 21:06:58 -08:00
parent e577dda5d9
commit e60de1775e
6 changed files with 450 additions and 0 deletions

View File

@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
from mrjob.job import MRJob
from mrjob.step import MRStep
class RemoveDuplicateUrls(MRJob):
    """MapReduce job that keeps only the input lines seen exactly once.

    Every input line is used verbatim as a key (presumably one URL per
    line, going by the class name -- confirm against the input format).
    A line that occurs more than once is dropped from the output
    entirely; it is NOT collapsed to a single copy.
    """

    def mapper(self, _, line):
        """Emit ``(line, 1)`` for each input line.

        Args:
            _: Input key; ignored.
            line: One line of input, used unchanged as the output key.

        Yields:
            The pair ``(line, 1)``.
        """
        yield line, 1

    def reducer(self, key, values):
        """Emit ``key`` only when its counts sum to exactly one.

        Args:
            key: A line emitted by the mapper.
            values: Iterable of the counts emitted for ``key``.

        Yields:
            ``(key, 1)`` when the counts sum to 1; nothing otherwise.
        """
        total = sum(values)
        # Strict equality: any key seen two or more times is filtered out.
        if total == 1:
            yield key, total

    def steps(self):
        """Run the map and reduce steps."""
        # MRStep is used instead of MRJob.mr(), which mrjob deprecated and
        # later removed (v0.6.0).
        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer)
        ]
# Launch the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    RemoveDuplicateUrls.run()