#!/usr/bin/python
""" count words in text documents then record the results to a file """
######################           INITIALIZATION           ######################
import string, os, sys, re, copy, cPickle, shelve,types, math
# may be reduced later some moved to function where required 
## Globals  
# some handy strings to use as variables 
nl, tb, sp, lb, hy, null = "\n", "\t", " ", "#", "-", ""

# list of acceptable single character non alpha words
symbol_words = ['&']

# sets the base directory where being run
base_path = os.getcwd()

# characters to be removed from front or end of word
char = re.compile(r"""[\s\.,:;!?"\(\)\[\]]+""")

#will eventually contain all the unique words in all files with file names etc.
total_words_dict = {}
# will contain file names with computed meta values
file_name_dict = {}
# TWEAKS #
low_range = 20 # low range in percentage of documents with word
high_range = 30 # high range in percentage of documents with word
######################              FUNCTIONS             ######################
def debug(*args):
	"""debug aid from Steve
	puts debug info to stderr
	"""
	for a in args:
		sys.stderr.write('' + str(a))
	sys.stderr.write(LF)

def initalize():
	""" Test file structure offer to make if not there already 
	variables local other than base_path, returns answer to build file 
	structure by returning string, ans.
	"""
	# default answer to pass test if not asked later 
	ans = "Y"
	# lets see if directories are present with boolean, stat_dir
	stat_dir = chk_dir(base_path)
	# if they aren't there let user choose to have them created 
	if not stat_dir:
		print """You don't have the proper directory structure 
		under your current directory."""
		# ask and receive answer
		ans = raw_input(
		"Would you like me to create them for you? (Y/N): ")
		# test answer
		if ans.lower() == "y":
			mk_dirs(base_path)# only if directories do not exist make them
		else:
			print "Please come back again when you are ready."
			# set to "stop" to skip rest of program if answer not yes
			ans = "stop"
	# continue if there are text files present and directories exist 
	# or user had them created 
	return ans

def chk_dir(base_path):
	""" Checks if directory sturcture is correct called by initalize()
	tests for all the necessary directory structures and returns the status in 
	the variable stat_dir. Will be True if they exist and False if they don't
	"""
	stat_dir = ( os.path.isdir("meta/Pcount_files") 		
		and os.path.isdir("text")
		) #tests to see if these directories exist 
	return stat_dir
	
def mk_dirs(base_path):
	""" Part of initilazation that makes directory structure if not there 
	If the user wants the directory structure to be built automatically this is 
	where that happens. Only run if directories are mising and the user wants 
	them created.
	"""
	import os
	if not os.path.isdir("text"):
		os.mkdir("text")
	if not os.path.isdir("meta/Pcount_files"):
		os.makedirs("meta/Pcount_files")

def create_basic_objects():
	""" Higher logic put in module to reduce clutter 
	This is where the bulk of work takes place in building objects and saving 
	to disk. The variable, file_count, seems to be all that needs to be passed 
	on as a global and it doesn't seem worth it to save it to disk. The single 
	most important object, total_words_dict, is/can be cPickled and shelved 
	shortly after calling this function.	
	"""
	global file_count #makes file_count globaly available
	# if the text directory is there and not told to stop continue 
	if os.listdir("text/") and ans != "stop":
		# loop through each file in the text directory 
		text_dir = base_path + "/text/" # text_dir is name of the directory 
		all_file_list = [] # will hold list of all files parsed
		# loop through each file in text_dir 
		for text_file in os.listdir(text_dir):
			# test if text_file is a file not a directory 
			if os.path.isfile(text_dir + text_file):
				# make list of all file names
				all_file_list.append(text_file)
				# content is text of current file 
				content = read_content(text_file)
				# split content into basic words	
				words = split_into_words(content)
				# list of refined words
				good_words = make_good_words(words)
				# dict of word/count 
				file_words_dict = make_file_words_dict(good_words)
				# now tweak file words dictionary
				file_words_dict,file_name_dict = tweak_file_words_dict(
					file_words_dict,text_file)
				# list from dict 
				file_words_list = file_words_dict.items()
				# pickle count dict
				pkl_ind_count(file_words_dict,text_file)
				# next combine total dict with individual file dictionary
				total_words_dict = add_to_total_words_dict(
					file_words_list,text_file)
			else:
				pass # wasn't a file so go on to next 
	# if directroy did not exist and told to stop 
	else:
		print "Sorry you have no text files in ",base_path + "/text/"
		print "Please put text files to be parsed in that directory and try again."
		raise SystemExit
	# will return None or finished dictionary 
	return total_words_dict, all_file_list,file_name_dict

def read_content(file):
	""" reads text file in one big string
	reads an individual file designated by the parameter (file) which is the 
	file name including path passed by the calling statemnet or function. 
	At this time it reads in the entire file in one chunk but if real large 
	files are to be expected this needs to be changed. The content of the file 
	is placed in the variable(content) and returned to the calling statement or 
	function.
	
	requires:
		passed: file <string>
	returns:
		content <string>
	"""
	rdfile = open(base_path + "/text/" + file, 'r') # open
	content = rdfile.read()#read entire contents of file into string = content
	return content
	
def split_into_words(content):
	""" split content string into words list 
	the resulting words may need to receive further 	work 	the re module 
	must be imported before this will 	work. The split is currently on 
	whitespace and a few extra 	characters. It may be improved but will 
	probably need furthur processing later anyway.
	
	requires:
		import: re <module>
		global: char <string>
		passed: content <string>
	returns: words <list>
	"""
	words = re.split(char, string.lower(content[:-1]))# split content into words
	return words

def make_good_words(words):
	""" Remove unwanted characters from words
	repairs and furthur refines, and selects words *STILL NEEDS WORK*
	
	This function takes a word list and loops through	it to select, reject, 
	or refine each word.as it now 	stands this function passes not enough 
	words that need to fixed up before being put into the list of acceptable 
	good words.
	STILL NEEDS WORK
	Consider breaking this function down into smaller reusable functions.
	
	requires:
		import: string <module>
		global: symbol_words <string>
		local: word <string>
		passed: words <list>
	returns: good_words <list>
"""
	good_words = [] #start new fresh list
	for word in words:
		if word.isalpha() or word.isdigit():
			good_words.append(word)
		elif len(word) == 1 and  word in symbol_words:
			good_words.append(word)#add to dictionary
		elif  word.startswith(r"'") and word.endswith(r"'"):
			word = word[1:(len(word)-1)] #strip single quotes on start and end
			good_words.append(word)#ok for now but maybe should recheck these	
			
			# this portion designed to pass hyphenated and contracted words 
			# but not allowing a word just containing hyphen or single quote 
		elif word.count(r"'",1,) or word.count("-",1,len(word)-1):			
			if not re.compile("[\s\d\w]").search(word):
				pass
				#word = ""# sets to null if word is all punctuation 
			else:
				if word != "":
					good_words.append(word)#add to dictionary
		elif word != "":
			good_words.append(word)
		else:
			pass # wasn't fixed, so pass without adding to list
	return good_words
	
def make_file_words_dict(good_words):
	"""Count unique words in and put in dictionary
	Unique words from current file are counted and put in dictionary, 
	file_words_dict. The list of good acceptable words are taken one at a time 
	and if the word is already in the dictionary the count is incremented by 1 
	and if the word is not already in the dictionary the word is added to the 
	dictionary and the value is set to 1.
	
	requires:
		local: word <string>
		passed: good_words <list>
	returns: file_words_dict <dictionary>
	"""
	global file_words_dict
	file_words_dict = {} # to contain all the unique words in file with a count 
	# go through each word in the good_words list'
	for word in good_words:
		# one more test for empty string getting past		
		if word == "":
			pass
		# check if word is in dictionary
		elif file_words_dict.has_key(word):
			# yes it was there so  increment by 1
			file_words_dict[word]  += 1
		else:
			# no was not there so put it there
			file_words_dict[word] = 1
	# return the dictionary to caller
	return file_words_dict
	
def tweak_file_words_dict(file_words_dict,file):
	""" Do more with file_words_dict
	file_words_dict contains the unique words in the text file, file with each 
	unique word being the key and value being the total number of times that 
	word appears in the file. This dictionary is created for each file in the 
	function, make_file_words_dict. Here we will do some calculations and add 
	the results to the dictionary.
	Calculate the total number of words in the text file. This might be able to 
	be done in make_file_words_dict but for now keep separate since it would 
	not be a simple len(good words) since some "" words are sneaking through 
	the filters and would skew the count.
	Calculate the mean by dividing total words by unique word count.
	Calculate variance and standard deviation in a second loop now we have the 
	mean. The variance is the mean of the squared differences from the mean of 
	the distribution, total words. Standard deviation is the positive square 
	root of the variance.
	Calculate the z-score of each word by taking the distance from the mean and 
	dividing by the standard deviation.
	Put the results back into the dictionary as a tuple along with the original 
	value. 
	Consider doing this with nested function(s), at least for looping through 
	the dictionary.
	While we are at it let's store some more data in a dictionary with unique 
	file names as key and some of the results of our calculations, for later 
	use.
	"""
	import math # needed for sqrt
	total_words = 0
	sum_of_squares = 0
	#print "## file = ",file #debug
	# calculate total number of words
	for key in file_words_dict:
		value = file_words_dict[key] # get count of occurrence in file 
		total_words += value # increment to get total 
	# calculate the mean, unique words will be same as lenght of dictionary
	number_unique_words = len(file_words_dict)
	# mean is total words divided by unique words 
	# make float for division to produce desired results
	mean = float(total_words) / float(number_unique_words)
	# calculate variance & standard deviation , must loop again because the 
	# calculations for each word requires knowing results from first loop
	for key in file_words_dict:
		value = file_words_dict[key] # get count again 
		distance = abs(value - mean) # distance from mean 
		square_of_difference = distance ** 2 # square difference from mean 
		sum_of_squares += square_of_difference # increment sum of squares 
	# calculate variance and standard deviation for file 
	variance = sum_of_squares / total_words
	standard_deviation = abs(math.sqrt(variance))
	# find z score, requires results from first two loops 
	for key in file_words_dict:
		# value is how many instances of unique word appear in file
		value = file_words_dict[key] 
		distance = abs(value - mean) # how far from mean
		z_score = distance / standard_deviation# calculate z-score
		new_value = (value,z_score) # add to dictionary as tuple with value
		file_words_dict[key] = new_value # store in dictionary
	# make a tuple to use as value when creating dictionary of file meta values 
	meta_values = (
		total_words,
		number_unique_words,
		mean,
		variance,
		standard_deviation
		)
	
	# make file name dictionary with meta values 
	make_file_name_dict(file,meta_values)
		
	return file_words_dict,file_name_dict
	
def make_file_name_dict(file,meta_values):
	""" Creates dictionary for key = file name and value = meta values 
	
	"""
	file_name_dict[file] = meta_values
	return file_name_dict


def pkl_ind_count(file_words_dict,file):
	""" Pickles the file_words_dict of individual files 
	and puts the dictionary of unique words with their count into a cPickle 
	file for later use. Ultimately this function may not be needed
	
	requires:
		import cPickle <module>
		passed file_words_dict <dictionary>
		passed file <string>
		local pf <string>
		local filename <string>
	returns: None
	
	"""
	pf = file[ 0 : file.find(".")]#removes extension if it exists
	filename = base_path + "/meta/Pcount_files/" + pf# v
	#sets n = meta dir with text file's name minus extension
	f = open(filename, "w")
	cPickle.dump(file_words_dict,f)
	f.close
	
def add_to_total_words_dict(file_words_list,file):
	"""Adds unique words in text file to dictionary of total words
	Add a valid word to the dictionary of total words, total_words_dict. If a 
	word is already in the dictionary include the count from the individual 
	file along with the name of the text file where it was just found. If the 
	word was not already in total_words_dict dictionary it will be added as a 
	new key value pair. This process will create the value as a tuple 
	consisting of the text file's name, the number of times that word appears 
	in that file and the word's z-score for that file. See tweak_file_words_dict
	for more info on z-score and other computed values.
	
	requires:
		passed: file_words_dict <dictionary>
		passed: file <string>
	returns:
		total_words_dict <dictionary>
	"""
	for entry in file_words_list:
		word = entry[0:1] [0]# get just word 
		count = entry[1][0] # get just count	
		z = entry[1][1]
		if total_words_dict.has_key(word):
			# yep, it was there so add new value 
			value = total_words_dict[word] # get previous value
			new_value = value + (file,count,z) # add on new value
			total_words_dict[word] = new_value # put it back in 
			#print "## TWD = ",total_words_dict[word]#debug
		else:
			#not there so make new key/value
			total_words_dict[word] = (file,count,z)
			oops = total_words_dict[word]
	# the key object of this module being returned to caller
	return total_words_dict

def pickle_obj(filename,obj):
	""" saves object to file by pickling 
	file = file where object is to be saved 
	filename = name of file
	dict = the dictionary to be saved 
		
	requires:
		import cPickle <module> 
		passed filename <string>
		passed obj <dictionary>
		local file <file>
	"""
	file = open( filename, "w" )
	cPickle.dump( obj, file )
	file.close()	
		
def store_in_shelve( obj, fn, fl ):
	""" put object in shelve 
	obj = object to shelve 
	fn = file name for shelve 
	fl = flagg for type of operation on file r,w, etc.
	db = what to shelve ???
	
	requires:
		passed obj <shelveable object>
		passed fn <string>
		passed fl <string> 
		local db <?>
	"""
	import shelve
	db = shelve.open( fn, fl )
	db = obj
#	db.sync()
	
def prompt_percentage():
	""" receives input for range of percentage 
	asks for two numbers in the range 0 to 100 and uses them 
	as the range in percentage to search for relevant words 
	NEEDS WORK should have better error and range handling 
	"""
	print "please enter two numbers in the range of 0 to 100 ",
	print "these will form the range in percentage in which to ",
	print "look for words relating documents. " 
	low_range = input("Please enter lower number 0 to 99: ")#imput low_range
	high_range = input("Please enter higher number 1 to 100: ")#input high_range
	print nl,"your range is ",low_range," to ",high_range," is this correct? "
	yn = raw_input("enter y for yes or n for no y/n: ")#input y or n
	yn = yn.lower()# make lowercase if not already
	if yn == "y":
		print "YES"#debug
		return low_range, high_range
	if yn == "n":
		print "NO"#debug
		low_range,high_range = 0,0
		prompt_percentage()
	return low_range,high_range

def list_count(list):
	"""Count the number of items in a list """
	count = len(list)
	return count
def build_pct_lst(dict,count):
	""" figure % files contain word  
	creates list, pct_lst, which contains a list of unique 
	words with the percentage of files that contain that 
	particular word. Does not contain list of files but that 
	can be retrieved from total_words_dict by using the word 
	as the key. NEEDS WORK but is ok as is (I think, maybe)
	"""
	list = dict.items()
	pct_lst= []
	for item in list:
		##print "item = ",item,"# ",len(item[0])#debug
		entry = item[0:1]
		word = str(entry[0])
		# this is sort of a bad way to do it, the length of item needs to be 
		# divided by three since each sequence contains three atoms 
		pct = ( (float( len(item[1]) )/3) / 2) / (float(count)) * 100
		pct_lst.append( ( word, pct ) )
	return pct_lst

def write_list(filename,list):
	""" Write a list to a text file 
	just writes a list, list to a file, filename  
	
	"""
	file = open(filename, "w")
	file.write(str(list))
	file.flush()
	#for item in list:
	#	print item
	print "length of list = ",len(list)

def show_range(list,low_range,high_range):
	""" Shows words in range 
	"""
	for item in list:
		if high_range > item[1] > low_range:
			word = str(item[0:1][0])
			pct = item[1]
			print word,"appears in ",round(pct,2),"% of documents"

def make_list_range(list,low_range,high_range):
	"""Creates list of unique words in range in % of documents containing them 
	"""
	list_range = []
	for item in list:		
		if high_range > item[1] > low_range:
			word = str(item[0:1][0])
			pct = item[1]
			list_range.append( ( word, pct ) )
	return list_range
	
def files_have_range_word(word,count_dict,file_name_dict):
	""" Make list of text documents containing a specified word
	count_dict will normally be total_words_dict and word will be a word that 
	appears within a percentage of documents in a specified range.
	Note: checking type was the easiest way to sort out the dictionary it was 
	a bit of a dirty way to do it, but it should get by for now.
	"""
	from types import *
	pos_lst = []
	neg_lst = []
	#print "\n## word = ",word # a word in range 
	value = count_dict[word] # gets entire value
	#print " value = ",value #debug
	for atom in value:
		if type(atom) == StringType:
			pos_lst.append(atom)
	for key in file_name_dict:
		if key not in pos_lst:
			neg_lst.append(key)
	return pos_lst,neg_lst


######################                BODY                ######################
print "\nHERE WE GO!..." #debug
ans = initalize() # holds answer for prompt 
#VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
# next the unique words will go in dictionary, total_words_dict ~ a list of    #
# all the files is in all_file_list ~ dictionary of file names in the form of  #
# key = file name value = tuple with (total_words, number_unique_words,        #
# mean, variance, & standard_deviation) in that order                          #
total_words_dict, all_file_list,file_name_dict = create_basic_objects()
# next save total_words_dict as an object for use later 
pickle_obj( base_path + "/meta/" + "total_words_dict.obj",total_words_dict)
# now save all_file_list
pickle_obj( base_path + "/meta/" + "all_file_list.obj",all_file_list)
# the shelve might be better for large amounts of data and is here for testing 
store_in_shelve(total_words_dict,(base_path + "/meta/total_words"), "c")
# do the same with all_file_list 
store_in_shelve(all_file_list,(base_path + "/meta/all_file_list"), "c")
# count total number of files by finding length of list with all file names
num_all_files = list_count(all_file_list)
# make a list of words with percentage of documents containing that word
pct_lst = build_pct_lst(total_words_dict,num_all_files)	

## post pickling  ##
# we may need to make some local variables global as needed 
# range "x% to y%" is short cut for range "low_range to high_range" 
show_range(pct_lst,low_range,high_range) # display words in range x% to y% 
# now make a list of words in range x% to y% 
list_range = make_list_range(pct_lst,low_range,high_range)
## print "list_range = ",list_range #debug
# Eventually make next gaggle of code a function 
# Will loop through the words in range x% to y%  
pos_lst = [] # do it here for now 
neg_lst = [] # do it here for now 
range_dict = {} # do it here for now 
for item1 in list_range:
	#print"item = ", item1[1],item1[0]#debug
	wurd = item1[0]
	pos_lst,neg_lst = files_have_range_word(
		wurd,total_words_dict,file_name_dict)
	range_dict[item1] = (pos_lst, neg_lst),
	title = wurd.upper() + " "
	print "## Word = ",title * 7
	print "# pos_lst = ",pos_lst # debug
	print "neg_lst = ",neg_lst #debug 


print "ALL FINISHED, That's all folks!" #debug

######################                END                #######################