2017-03-05 08:06:58 +03:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
from mrjob.job import MRJob
from mrjob.step import MRStep
|
|
|
|
|
|
|
|
|
2021-03-14 13:08:05 +03:00
|
|
|
class RemoveDuplicateUrls(MRJob):
    """MapReduce job that outputs the input lines occurring exactly once.

    Every input line (presumably one URL per line -- confirm against the
    job's input) is used as a key and counted; only keys whose count is 1
    reach the output.
    """

    def mapper(self, _, line):
        """Emit ``(line, 1)`` for each input line.

        The incoming key is ignored; the raw line itself becomes the key so
        that identical lines are grouped together for the reducer.
        """
        yield line, 1

    def reducer(self, key, values):
        """Emit ``(key, 1)`` only when ``key`` was seen exactly once.

        ``values`` is the iterable of counts emitted by the mapper for
        ``key``; they are summed to get the number of occurrences.

        NOTE(review): a key seen more than once is dropped entirely rather
        than collapsed to a single record -- confirm this is the intended
        meaning of "remove duplicates".
        """
        total = sum(values)
        if total == 1:
            yield key, total

    def steps(self):
        """Run the map and reduce steps."""
        # MRJob.mr() is a deprecated API that was dropped from later mrjob
        # releases; MRStep is the supported way to describe a step.
        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer),
        ]
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: hand control to mrjob's runner only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    RemoveDuplicateUrls.run()
|