system-design-primer/solutions/system_design/pastebin/pastebin.py

47 lines
1.0 KiB
Python
Raw Normal View History

2017-03-05 08:05:53 +03:00
# -*- coding: utf-8 -*-
from mrjob.job import MRJob
2021-03-14 13:08:05 +03:00
class HitCounts(MRJob):
    """MapReduce job that counts hits per (year-month, url) pair.

    The mapper turns every log line into a ``((period, url), 1)`` record and
    the reducer adds up the ones for each distinct ``(period, url)`` key.
    """

    def extract_url(self, line):
        """Extract the generated url from the log line.

        NOTE: unimplemented stub -- it currently returns ``None`` for every
        line. Fill in the parsing once the log format is defined.
        """
        pass

    def extract_year_month(self, line):
        """Return the year and month portions of the timestamp.

        NOTE: unimplemented stub -- it currently returns ``None`` for every
        line. Fill in the parsing once the log format is defined.
        """
        pass

    def mapper(self, _, line):
        """Parse each log line, extract and transform relevant lines.

        The input key is ignored; only the raw ``line`` is used.

        Emit key value pairs of the form:

        (2016-01, url0), 1
        (2016-01, url0), 1
        (2016-01, url1), 1
        """
        url = self.extract_url(line)
        period = self.extract_year_month(line)
        yield (period, url), 1

    def reducer(self, key, values):
        """Sum values for each key.

        (2016-01, url0), 2
        (2016-01, url1), 1
        """
        yield key, sum(values)

    def steps(self):
        """Run the map and reduce steps.

        Returns a one-element list holding a single step that wires this
        job's ``mapper`` to its ``reducer``.
        """
        # ``MRJob.mr()`` is the legacy way to declare a step and is not
        # available in current mrjob releases; ``MRStep`` is its replacement.
        # Imported here so this class does not depend on any import beyond
        # the ``mrjob`` package the module already requires.
        from mrjob.step import MRStep

        return [
            MRStep(mapper=self.mapper, reducer=self.reducer),
        ]
if __name__ == '__main__':
    # Script entry point: hand control to mrjob, which parses the command
    # line and runs the HitCounts job.
    HitCounts.run()