I think I understand:
=== Classic Worker (deprecated) ===
# Here only the byte-code is pickled and sent over the wire
def map(line, params):
    """Emit a ``(word, 1)`` pair for every word in *line*.

    Words are the whitespace-separated tokens returned by ``line.split()``,
    so an empty or all-whitespace line yields nothing.

    The name intentionally matches the ``map=`` keyword it is passed to
    when the job is run, even though it shadows the builtin.

    Args:
        line: One line of input text.
        params: Job parameters; not used by this function.

    Yields:
        ``(word, 1)`` tuples, one per token, in order of appearance.
    """
    for word in line.split():
        yield word, 1
def reduce(iter, params):
    """Yield the total count for each word found in *iter*.

    The name intentionally matches the ``reduce=`` keyword it is passed to
    when the job is run.

    Args:
        iter: Iterable of ``(word, count)`` pairs. It is fully sorted
            before grouping, so it must be finite and its items orderable.
        params: Job parameters; not used by this function.

    Yields:
        ``(word, total)`` tuples, where ``total`` is ``sum()`` of the
        counts grouped under ``word``.
    """
    # Function-local import: presumably required because only this
    # function's byte-code is shipped to the worker -- TODO confirm.
    from disco.util import kvgroup

    # Sorting first puts equal words next to each other; kvgroup presumably
    # groups only consecutive pairs that share a key -- verify in disco.util.
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)
if __name__ == '__main__':
    # Run the word count over a single remote text file, passing the
    # module-level map and reduce functions explicitly.
    # The URL is kept on one line: a quoted string cannot span lines.
    job = Job().run(
        input=["http://discoproject.org/media/text/chekhov.txt"],
        map=map,
        reduce=reduce)
=== New Hotness (preferred) ===
# Here the entire source file is sent over the wire
class WordCountJob(Job):
    """Word-count job whose map and reduce phases are static methods.

    Both phases are declared with ``@staticmethod``, so they take no
    ``self`` and keep a plain ``(data, params)`` signature.
    """

    @staticmethod
    def map(line, params):
        """Emit a ``(word, 1)`` pair for every word in *line*.

        Args:
            line: One line of input text.
            params: Job parameters; not used by this method.

        Yields:
            ``(word, 1)`` tuples, one per whitespace-separated token.
        """
        for word in line.split():
            yield word, 1

    @staticmethod
    def reduce(iter, params):
        """Yield the total count for each word found in *iter*.

        Args:
            iter: Iterable of ``(word, count)`` pairs; it is fully sorted
                before grouping, so it must be finite.
            params: Job parameters; not used by this method.

        Yields:
            ``(word, total)`` tuples, where ``total`` is ``sum()`` of the
            counts grouped under ``word``.
        """
        from disco.util import kvgroup

        # Sorting first puts equal words next to each other; kvgroup
        # presumably groups only consecutive pairs sharing a key -- verify.
        for word, counts in kvgroup(sorted(iter)):
            yield word, sum(counts)
if __name__ == '__main__':
    # Run the word count over a single remote text file; map and reduce
    # come from the WordCountJob class, so only the input is passed.
    # The URL is kept on one line: a quoted string cannot span lines.
    job = WordCountJob().run(
        input=["http://discoproject.org/media/text/chekhov.txt"])