[code.view]

[top] / python / PyMOTW / multiprocessing / multiprocessing_wordcount.py

     #!/usr/bin/env python
     # encoding: utf-8
     #
     # Copyright (c) 2009 Doug Hellmann All rights reserved.
     #
     """
     """
     #end_pymotw_header
     import multiprocessing
     import string
     
     from multiprocessing_mapreduce import SimpleMapReduce
     
     def file_to_words(filename):
         """Read a file and return a list of (word, 1) tuples.

         One tuple is produced for every occurrence of every word that
         survives filtering, so a repeated word appears repeatedly in the
         result.  Processing rules, applied line by line:

         * lines whose first non-blank characters are '..' are skipped;
         * every punctuation character is replaced by a space;
         * the remaining tokens are lower-cased;
         * tokens that are not purely alphabetic, or that are listed in
           STOP_WORDS, are dropped.

         The current process name and the file name are printed before the
         file is read.
         """
         STOP_WORDS = set([
                 'a', 'an', 'and', 'are', 'as', 'be', 'by', 'for', 'if', 'in',
                 'is', 'it', 'of', 'or', 'py', 'rst', 'that', 'the', 'to', 'with',
                 ])
         # string.maketrans() exists only on Python 2; Python 3 provides the
         # equivalent as str.maketrans().  Use whichever is available so the
         # function is not tied to one interpreter.  The "or" short-circuits,
         # so str.maketrans is never looked up on Python 2.
         maketrans = getattr(string, 'maketrans', None) or str.maketrans
         TR = maketrans(string.punctuation, ' ' * len(string.punctuation))

         # A single pre-formatted argument in parentheses produces the same
         # output whether print is a statement or a function.
         print('%s reading %s' % (multiprocessing.current_process().name,
                                  filename))
         output = []

         with open(filename, 'rt') as f:
             for line in f:
                 if line.lstrip().startswith('..'): # Skip rst comment lines
                     continue
                 line = line.translate(TR) # Strip punctuation
                 for word in line.split():
                     word = word.lower()
                     if word.isalpha() and word not in STOP_WORDS:
                         output.append((word, 1))
         return output
     
     
     def count_words(item):
         """Reduce one partitioned entry to a (word, total) tuple.

         item is unpacked into exactly two parts: the word, and an iterable
         of the counts collected for that word.  The counts are summed.
         """
         key, tallies = item
         total = sum(tallies)
         return (key, total)
     
     
     if __name__ == '__main__':
         # Script entry point: count words across every *.rst file in the
         # current directory and print the 20 most frequent ones.
         import operator
         import glob

         input_files = glob.glob('*.rst')

         mapper = SimpleMapReduce(file_to_words, count_words)
         word_counts = mapper(input_files)
         # Ascending sort followed by reverse() gives descending counts.
         # This is deliberately not sort(reverse=True): that would order
         # words with equal counts differently.
         word_counts.sort(key=operator.itemgetter(1))
         word_counts.reverse()

         # Each print takes one pre-formatted argument in parentheses, which
         # produces the same output as the print statement form.
         print('\nTOP 20 WORDS BY FREQUENCY\n')
         top20 = word_counts[:20]
         # max() raises ValueError on an empty sequence, so only compute the
         # column width when there is at least one word to report.
         if top20:
             longest = max(len(word) for word, count in top20)
             for word, count in top20:
                 print('%-*s: %5s' % (longest+1, word, count))
     
     

[top] / python / PyMOTW / multiprocessing / multiprocessing_wordcount.py

contact | logmethods.com