#!/usr/bin/python
"""
Count the words in text documents.

For every file found in the text directory the per-file word counts are
written to a file named after the original file with a ".count" suffix
appended.  The counts of all files are also accumulated in a module level
dictionary (total_words_dict) so that they can later be put into a global
word count file.
"""

## initialization: import some modules ##################
import string
import os
import sys
import re
import copy

# some handy strings to use as variables
nl, tb, sp, lb, hy, null = "\n", "\t", " ", "#", "-", ""

# list of acceptable single character non alpha words
symbol_words = ['&']

# internal word symbols that are acceptable

# sets the base directory where all of this will live
base_path = "/home/kurt/bin/WordsCountProject/SandBox/TestRewrite/"

# sub directory that contains the text files to be parsed; it will be under
# the base path, base_path
text_dir = "text/"

# reserved directory in which to stash meta data
meta_dir = "meta/"

# place to put count files; it will be under the base path, base_path
count_dir = "text-count/"

# global dictionary for all unique words with the total count in all files
global_dict = {}

# characters that separate words: whitespace and ? " ( ) [ ]
char = re.compile(r"""[\s?"()\[\]]+""")

# directory the text files are read from (used by read_file and main)
path = base_path + text_dir

# will contain all the words in all files, with their combined counts
total_words_dict = {}

################################################


def read_file(file):
    """ reads a single file in one big chunk

    Reads the file named (file) from the directory held in the global
    variable (path).  The entire file is read in one chunk, so this needs
    to be changed if really large files are to be expected.

    depends:
        import: os
        global: path
    passed:     file
    returns:    content
    """
    # "with" closes the handle even when the read fails
    with open(os.path.join(path, file), 'r') as text_file:
        return text_file.read()


def split_content(content):
    """ splits content string into a list of lower case words

    The split is on runs of whitespace and a few extra characters (see the
    global char).  Text that starts or ends with a separator produces an
    empty string at that end of the list; fix_words discards those.  The
    resulting words still need further processing by fix_words.

    depends:
        global: char
    passed:     content
    returns:    words
    """
    # The whole string is split: slicing off the last character (as an
    # earlier version did) truncates the last word of a file that has no
    # trailing newline.
    return char.split(content.lower())


def fix_words(words):
    """ repairs, further refines, and selects words *STILL NEEDS WORK*

    Loops through a word list and selects, rejects, or refines each word:

    * empty strings are rejected
    * purely alphabetic or purely numeric words are accepted
    * single characters listed in symbol_words are accepted
    * words wrapped in single quotes are accepted with the quotes removed,
      unless nothing is left once the quotes are gone
    * words with an apostrophe after the first character, or a hyphen
      strictly inside the word, are accepted unchanged
    * everything else is rejected

    depends:
        global: symbol_words
    passed:     words
    returns:    good_words
    """
    good_words = []  # start new fresh list
    for word in words:
        if word == "":
            continue
        if word.isalpha() or word.isdigit():
            good_words.append(word)
        elif len(word) == 1 and word in symbol_words:
            good_words.append(word)
        elif word.startswith("'") and word.endswith("'"):
            # strip single quotes on start and end; a bare "'" or "''"
            # leaves nothing behind and must not be counted as a word
            unquoted = word[1:-1]
            if unquoted:
                # ok for now but maybe should recheck these
                good_words.append(unquoted)
        elif word.count("'", 1) or word.count("-", 1, len(word) - 1):
            good_words.append(word)
    return good_words


def count_words(good_words):
    """ counts words in current file and puts words & counts in dictionary

    Each word starts with a count of 1 the first time it is seen and the
    count is incremented by 1 every time the word is seen again.

    passed:     good_words
    returns:    file_words_dict
    """
    file_words_dict = {}  # all the unique words in the file with a count
    for word in good_words:
        file_words_dict[word] = file_words_dict.get(word, 0) + 1
    return file_words_dict


def write_count_file(file_words_dict, file):
    """ writes file's word count to related file

    Creates base_path + count_dir + file + ".count" containing one line per
    word, sorted by word, in the form "word count" followed by a newline.
    (The old Python 2 print based writer left a stray space in front of
    each newline; that space is no longer written.)

    depends:
        import: os
        global: base_path
        global: count_dir
    passed:     file_words_dict
    passed:     file
    returns:    file_words_list, the sorted list of (word, count) tuples
    """
    # sorted() works on both the list (Python 2) and the view (Python 3)
    # that dict.items() returns
    file_words_list = sorted(file_words_dict.items())
    count_file = os.path.join(base_path, count_dir, file + ".count")
    with open(count_file, "w") as word_count_file:
        for word, count in file_words_list:
            word_count_file.write("%s %s\n" % (word, count))
    return file_words_list


def add_to_total_dict(file_words_list, file):
    """ adds the word counts of one file to the dictionary of total words

    Every (word, count) pair is added to the running count held in the
    global total_words_dict, which is updated in place.

    depends:
        global: total_words_dict
    passed:     file_words_list
    passed:     file (not used yet; kept so existing calls keep working)
    returns:    total_words_dict
    """
    for word, count in file_words_list:
        total_words_dict[word] = total_words_dict.get(word, 0) + count
    return total_words_dict


def write_total(total_words_dict):
    """ write the total words in a list

    still needs to be written
    """
    pass

## end functions - start higher level logic ##


def main():
    """ counts the words of every file in the text directory

    Every regular file in (path) is read, split into words, filtered and
    counted; its counts are written to a .count file and added to
    total_words_dict.  Anything in the directory that is not a regular
    file is skipped.

    depends:
        import: os
        global: path
    """
    print("\nHere we go...")
    for name in os.listdir(path):
        if not os.path.isfile(os.path.join(path, name)):
            continue  # not a file, so nothing to count
        print("\nthis file is ## %s" % name)  # debug line
        content = read_file(name)  # get raw contents of file
        words = split_content(content)  # split raw contents into words
        good_words = fix_words(words)  # keep only the good words
        file_words_dict = count_words(good_words)
        # write a .count file for the current file
        file_words_list = write_count_file(file_words_dict, name)
        # combine total dict with the counts of this file
        add_to_total_dict(file_words_list, name)
    # TODO: save the total word list once write_total has been written
    ## END ##
    print("That's all folks!")


if __name__ == "__main__":
    main()