2017-03-05 08:05:53 +03:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
from mrjob.job import MRJob
from mrjob.step import MRStep
|
|
|
|
|
|
|
|
|
2021-03-26 19:50:38 +03:00
|
|
|
class HitCounts(MRJob):
    """MapReduce job that counts hits per (year-month, url) pair.

    Every input log line is mapped to ``((period, url), 1)``; the reducer
    then sums the ones for each distinct key.
    """

    def extract_url(self, line):
        """Extract the generated url from the log line.

        Placeholder: the log format is not defined in this module, so the
        method is unimplemented and currently returns None.
        """
        pass

    def extract_year_month(self, line):
        """Return the year and month portions of the timestamp.

        Placeholder: the log format is not defined in this module, so the
        method is unimplemented and currently returns None.
        """
        pass

    def mapper(self, _, line):
        """Parse each log line, extract and transform relevant lines.

        The input key is ignored; only the raw line is used.

        Emit key value pairs of the form:

        (2016-01, url0), 1
        (2016-01, url0), 1
        (2016-01, url1), 1
        """
        url = self.extract_url(line)
        period = self.extract_year_month(line)
        yield (period, url), 1

    def reducer(self, key, values):
        """Sum values for each key.

        (2016-01, url0), 2
        (2016-01, url1), 1
        """
        yield key, sum(values)

    def steps(self):
        """Run the map and reduce steps.

        Returns a single-step pipeline wiring ``mapper`` to ``reducer``.
        """
        # MRStep replaces the old MRJob.mr() helper, which is no longer
        # available in current mrjob releases.
        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer)
        ]
|
|
|
|
|
|
|
|
|
|
|
|
# Launch the job only when this file is executed as a script, so that
# importing the module does not start a MapReduce run.
if __name__ == "__main__":
    HitCounts.run()
|