#!/usr/bin/python """ count words in text documents then record the results to a file """ ###################### INITIALIZATION ###################### import string, os, sys, re, copy, cPickle, shelve,types, math # may be reduced later some moved to function where required ## Globals # some handy strings to use as variables nl, tb, sp, lb, hy, null = "\n", "\t", " ", "#", "-", "" # list of acceptable single character non alpha words symbol_words = ['&'] # sets the base directory where being run base_path = os.getcwd() # characters to be removed from front or end of word char = re.compile(r"""[\s\.,:;!?"\(\)\[\]]+""") #will eventually contain all the unique words in all files with file names etc. total_words_dict = {} # will contain file names with computed meta values file_name_dict = {} # TWEAKS # low_range = 20 # low range in percentage of documents with word high_range = 30 # high range in percentage of documents with word ###################### FUNCTIONS ###################### def debug(*args): """debug aid from Steve puts debug info to stderr """ for a in args: sys.stderr.write('' + str(a)) sys.stderr.write(LF) def initalize(): """ Test file structure offer to make if not there already variables local other than base_path, returns answer to build file structure by returning string, ans. """ # default answer to pass test if not asked later ans = "Y" # lets see if directories are present with boolean, stat_dir stat_dir = chk_dir(base_path) # if they aren't there let user choose to have them created if not stat_dir: print """You don't have the proper directory structure under your current directory.""" # ask and receive answer ans = raw_input( "Would you like me to create them for you? (Y/N): ") # test answer if ans.lower() == "y": mk_dirs(base_path)# only if directories do not exist make them else: print "Please come back again when you are ready." # set to "stop" to skip rest of program if answer not yes ans = "stop" # continue if there are text files present and directories exist # or user had them created return ans def chk_dir(base_path): """ Checks if directory sturcture is correct called by initalize() tests for all the necessary directory structures and returns the status in the variable stat_dir. Will be True if they exist and False if they don't """ stat_dir = ( os.path.isdir("meta/Pcount_files") and os.path.isdir("text") ) #tests to see if these directories exist return stat_dir def mk_dirs(base_path): """ Part of initilazation that makes directory structure if not there If the user wants the directory structure to be built automatically this is where that happens. Only run if directories are mising and the user wants them created. """ import os if not os.path.isdir("text"): os.mkdir("text") if not os.path.isdir("meta/Pcount_files"): os.makedirs("meta/Pcount_files") def create_basic_objects(): """ Higher logic put in module to reduce clutter This is where the bulk of work takes place in building objects and saving to disk. The variable, file_count, seems to be all that needs to be passed on as a global and it doesn't seem worth it to save it to disk. The single most important object, total_words_dict, is/can be cPickled and shelved shortly after calling this function. """ global file_count #makes file_count globaly available # if the text directory is there and not told to stop continue if os.listdir("text/") and ans != "stop": # loop through each file in the text directory text_dir = base_path + "/text/" # text_dir is name of the directory all_file_list = [] # will hold list of all files parsed # loop through each file in text_dir for text_file in os.listdir(text_dir): # test if text_file is a file not a directory if os.path.isfile(text_dir + text_file): # make list of all file names all_file_list.append(text_file) # content is text of current file content = read_content(text_file) # split content into basic words words = split_into_words(content) # list of refined words good_words = make_good_words(words) # dict of word/count file_words_dict = make_file_words_dict(good_words) # now tweak file words dictionary file_words_dict,file_name_dict = tweak_file_words_dict( file_words_dict,text_file) # list from dict file_words_list = file_words_dict.items() # pickle count dict pkl_ind_count(file_words_dict,text_file) # next combine total dict with individual file dictionary total_words_dict = add_to_total_words_dict( file_words_list,text_file) else: pass # wasn't a file so go on to next # if directroy did not exist and told to stop else: print "Sorry you have no text files in ",base_path + "/text/" print "Please put text files to be parsed in that directory and try again." raise SystemExit # will return None or finished dictionary return total_words_dict, all_file_list,file_name_dict def read_content(file): """ reads text file in one big string reads an individual file designated by the parameter (file) which is the file name including path passed by the calling statemnet or function. At this time it reads in the entire file in one chunk but if real large files are to be expected this needs to be changed. The content of the file is placed in the variable(content) and returned to the calling statement or function. requires: passed: file returns: content """ rdfile = open(base_path + "/text/" + file, 'r') # open content = rdfile.read()#read entire contents of file into string = content return content def split_into_words(content): """ split content string into words list the resulting words may need to receive further work the re module must be imported before this will work. The split is currently on whitespace and a few extra characters. It may be improved but will probably need furthur processing later anyway. requires: import: re global: char passed: content returns: words """ words = re.split(char, string.lower(content[:-1]))# split content into words return words def make_good_words(words): """ Remove unwanted characters from words repairs and furthur refines, and selects words *STILL NEEDS WORK* This function takes a word list and loops through it to select, reject, or refine each word.as it now stands this function passes not enough words that need to fixed up before being put into the list of acceptable good words. STILL NEEDS WORK Consider breaking this function down into smaller reusable functions. requires: import: string global: symbol_words local: word passed: words returns: good_words """ good_words = [] #start new fresh list for word in words: if word.isalpha() or word.isdigit(): good_words.append(word) elif len(word) == 1 and word in symbol_words: good_words.append(word)#add to dictionary elif word.startswith(r"'") and word.endswith(r"'"): word = word[1:(len(word)-1)] #strip single quotes on start and end good_words.append(word)#ok for now but maybe should recheck these # this portion designed to pass hyphenated and contracted words # but not allowing a word just containing hyphen or single quote elif word.count(r"'",1,) or word.count("-",1,len(word)-1): if not re.compile("[\s\d\w]").search(word): pass #word = ""# sets to null if word is all punctuation else: if word != "": good_words.append(word)#add to dictionary elif word != "": good_words.append(word) else: pass # wasn't fixed, so pass without adding to list return good_words def make_file_words_dict(good_words): """Count unique words in and put in dictionary Unique words from current file are counted and put in dictionary, file_words_dict. The list of good acceptable words are taken one at a time and if the word is already in the dictionary the count is incremented by 1 and if the word is not already in the dictionary the word is added to the dictionary and the value is set to 1. requires: local: word passed: good_words returns: file_words_dict """ global file_words_dict file_words_dict = {} # to contain all the unique words in file with a count # go through each word in the good_words list' for word in good_words: # one more test for empty string getting past if word == "": pass # check if word is in dictionary elif file_words_dict.has_key(word): # yes it was there so increment by 1 file_words_dict[word] += 1 else: # no was not there so put it there file_words_dict[word] = 1 # return the dictionary to caller return file_words_dict def tweak_file_words_dict(file_words_dict,file): """ Do more with file_words_dict file_words_dict contains the unique words in the text file, file with each unique word being the key and value being the total number of times that word appears in the file. This dictionary is created for each file in the function, make_file_words_dict. Here we will do some calculations and add the results to the dictionary. Calculate the total number of words in the text file. This might be able to be done in make_file_words_dict but for now keep separate since it would not be a simple len(good words) since some "" words are sneaking through the filters and would skew the count. Calculate the mean by dividing total words by unique word count. Calculate variance and standard deviation in a second loop now we have the mean. The variance is the mean of the squared differences from the mean of the distribution, total words. Standard deviation is the positive square root of the variance. Calculate the z-score of each word by taking the distance from the mean and dividing by the standard deviation. Put the results back into the dictionary as a tuple along with the original value. Consider doing this with nested function(s), at least for looping through the dictionary. While we are at it let's store some more data in a dictionary with unique file names as key and some of the results of our calculations, for later use. """ import math # needed for sqrt total_words = 0 sum_of_squares = 0 #print "## file = ",file #debug # calculate total number of words for key in file_words_dict: value = file_words_dict[key] # get count of occurrence in file total_words += value # increment to get total # calculate the mean, unique words will be same as lenght of dictionary number_unique_words = len(file_words_dict) # mean is total words divided by unique words # make float for division to produce desired results mean = float(total_words) / float(number_unique_words) # calculate variance & standard deviation , must loop again because the # calculations for each word requires knowing results from first loop for key in file_words_dict: value = file_words_dict[key] # get count again distance = abs(value - mean) # distance from mean square_of_difference = distance ** 2 # square difference from mean sum_of_squares += square_of_difference # increment sum of squares # calculate variance and standard deviation for file variance = sum_of_squares / total_words standard_deviation = abs(math.sqrt(variance)) # find z score, requires results from first two loops for key in file_words_dict: # value is how many instances of unique word appear in file value = file_words_dict[key] distance = abs(value - mean) # how far from mean z_score = distance / standard_deviation# calculate z-score new_value = (value,z_score) # add to dictionary as tuple with value file_words_dict[key] = new_value # store in dictionary # make a tuple to use as value when creating dictionary of file meta values meta_values = ( total_words, number_unique_words, mean, variance, standard_deviation ) # make file name dictionary with meta values make_file_name_dict(file,meta_values) return file_words_dict,file_name_dict def make_file_name_dict(file,meta_values): """ Creates dictionary for key = file name and value = meta values """ file_name_dict[file] = meta_values return file_name_dict def pkl_ind_count(file_words_dict,file): """ Pickles the file_words_dict of individual files and puts the dictionary of unique words with their count into a cPickle file for later use. Ultimately this function may not be needed requires: import cPickle passed file_words_dict passed file local pf local filename returns: None """ pf = file[ 0 : file.find(".")]#removes extension if it exists filename = base_path + "/meta/Pcount_files/" + pf# v #sets n = meta dir with text file's name minus extension f = open(filename, "w") cPickle.dump(file_words_dict,f) f.close def add_to_total_words_dict(file_words_list,file): """Adds unique words in text file to dictionary of total words Add a valid word to the dictionary of total words, total_words_dict. If a word is already in the dictionary include the count from the individual file along with the name of the text file where it was just found. If the word was not already in total_words_dict dictionary it will be added as a new key value pair. This process will create the value as a tuple consisting of the text file's name, the number of times that word appears in that file and the word's z-score for that file. See tweak_file_words_dict for more info on z-score and other computed values. requires: passed: file_words_dict passed: file returns: total_words_dict """ for entry in file_words_list: word = entry[0:1] [0]# get just word count = entry[1][0] # get just count z = entry[1][1] if total_words_dict.has_key(word): # yep, it was there so add new value value = total_words_dict[word] # get previous value new_value = value + (file,count,z) # add on new value total_words_dict[word] = new_value # put it back in #print "## TWD = ",total_words_dict[word]#debug else: #not there so make new key/value total_words_dict[word] = (file,count,z) oops = total_words_dict[word] # the key object of this module being returned to caller return total_words_dict def pickle_obj(filename,obj): """ saves object to file by pickling file = file where object is to be saved filename = name of file dict = the dictionary to be saved requires: import cPickle passed filename passed obj local file """ file = open( filename, "w" ) cPickle.dump( obj, file ) file.close() def store_in_shelve( obj, fn, fl ): """ put object in shelve obj = object to shelve fn = file name for shelve fl = flagg for type of operation on file r,w, etc. db = what to shelve ??? requires: passed obj passed fn passed fl local db """ import shelve db = shelve.open( fn, fl ) db = obj # db.sync() def prompt_percentage(): """ receives input for range of percentage asks for two numbers in the range 0 to 100 and uses them as the range in percentage to search for relevant words NEEDS WORK should have better error and range handling """ print "please enter two numbers in the range of 0 to 100 ", print "these will form the range in percentage in which to ", print "look for words relating documents. " low_range = input("Please enter lower number 0 to 99: ")#imput low_range high_range = input("Please enter higher number 1 to 100: ")#input high_range print nl,"your range is ",low_range," to ",high_range," is this correct? " yn = raw_input("enter y for yes or n for no y/n: ")#input y or n yn = yn.lower()# make lowercase if not already if yn == "y": print "YES"#debug return low_range, high_range if yn == "n": print "NO"#debug low_range,high_range = 0,0 prompt_percentage() return low_range,high_range def list_count(list): """Count the number of items in a list """ count = len(list) return count def build_pct_lst(dict,count): """ figure % files contain word creates list, pct_lst, which contains a list of unique words with the percentage of files that contain that particular word. Does not contain list of files but that can be retrieved from total_words_dict by using the word as the key. NEEDS WORK but is ok as is (I think, maybe) """ list = dict.items() pct_lst= [] for item in list: ##print "item = ",item,"# ",len(item[0])#debug entry = item[0:1] word = str(entry[0]) # this is sort of a bad way to do it, the length of item needs to be # divided by three since each sequence contains three atoms pct = ( (float( len(item[1]) )/3) / 2) / (float(count)) * 100 pct_lst.append( ( word, pct ) ) return pct_lst def write_list(filename,list): """ Write a list to a text file just writes a list, list to a file, filename """ file = open(filename, "w") file.write(str(list)) file.flush() #for item in list: # print item print "length of list = ",len(list) def show_range(list,low_range,high_range): """ Shows words in range """ for item in list: if high_range > item[1] > low_range: word = str(item[0:1][0]) pct = item[1] print word,"appears in ",round(pct,2),"% of documents" def make_list_range(list,low_range,high_range): """Creates list of unique words in range in % of documents containing them """ list_range = [] for item in list: if high_range > item[1] > low_range: word = str(item[0:1][0]) pct = item[1] list_range.append( ( word, pct ) ) return list_range def files_have_range_word(word,count_dict,file_name_dict): """ Make list of text documents containing a specified word count_dict will normally be total_words_dict and word will be a word that appears within a percentage of documents in a specified range. Note: checking type was the easiest way to sort out the dictionary it was a bit of a dirty way to do it, but it should get by for now. """ from types import * pos_lst = [] neg_lst = [] #print "\n## word = ",word # a word in range value = count_dict[word] # gets entire value #print " value = ",value #debug for atom in value: if type(atom) == StringType: pos_lst.append(atom) for key in file_name_dict: if key not in pos_lst: neg_lst.append(key) return pos_lst,neg_lst ###################### BODY ###################### print "\nHERE WE GO!..." #debug ans = initalize() # holds answer for prompt #VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV # next the unique words will go in dictionary, total_words_dict ~ a list of # # all the files is in all_file_list ~ dictionary of file names in the form of # # key = file name value = tuple with (total_words, number_unique_words, # # mean, variance, & standard_deviation) in that order # total_words_dict, all_file_list,file_name_dict = create_basic_objects() # next save total_words_dict as an object for use later pickle_obj( base_path + "/meta/" + "total_words_dict.obj",total_words_dict) # now save all_file_list pickle_obj( base_path + "/meta/" + "all_file_list.obj",all_file_list) # the shelve might be better for large amounts of data and is here for testing store_in_shelve(total_words_dict,(base_path + "/meta/total_words"), "c") # do the same with all_file_list store_in_shelve(all_file_list,(base_path + "/meta/all_file_list"), "c") # count total number of files by finding length of list with all file names num_all_files = list_count(all_file_list) # make a list of words with percentage of documents containing that word pct_lst = build_pct_lst(total_words_dict,num_all_files) ## post pickling ## # we may need to make some local variables global as needed # range "x% to y%" is short cut for range "low_range to high_range" show_range(pct_lst,low_range,high_range) # display words in range x% to y% # now make a list of words in range x% to y% list_range = make_list_range(pct_lst,low_range,high_range) ## print "list_range = ",list_range #debug # Eventually make next gaggle of code a function # Will loop through the words in range x% to y% pos_lst = [] # do it here for now neg_lst = [] # do it here for now range_dict = {} # do it here for now for item1 in list_range: #print"item = ", item1[1],item1[0]#debug wurd = item1[0] pos_lst,neg_lst = files_have_range_word( wurd,total_words_dict,file_name_dict) range_dict[item1] = (pos_lst, neg_lst), title = wurd.upper() + " " print "## Word = ",title * 7 print "# pos_lst = ",pos_lst # debug print "neg_lst = ",neg_lst #debug print "ALL FINISHED, That's all folks!" #debug ###################### END #######################