#!/usr/bin/python
""" 
This module is designed to count words in text documents then record
the results to a file named as the original file but with a .count suffix
appended as the extension. The total word count of all those files will
be put into a global word count file.
"""
## initialization import some modules ##################
import string, os, sys, re, copy
# short names for a few characters so they can be used as variables
nl = "\n"
tb = "\t"
sp = " "
lb = "#"
hy = "-"
null = ""
# single character words that are accepted even though they are not letters
symbol_words = ['&']
# NOTE: a list of acceptable internal word symbols was announced at this
# point but no variable is defined for it yet
# base directory under which everything else lives
base_path = "/home/kurt/bin/WordsCountProject/SandBox/TestRewrite/"
# sub directory of base_path holding the text files to be parsed
text_dir = "text/"
# sub directory reserved for stashing meta data
meta_dir = "meta/"
# sub directory of base_path that receives the .count files
count_dir = "text-count/"
# dictionary meant for every unique word with its total count over all files
global_dict = {}
# one or more of: whitespace ? " ( ) [ ]  -- split_content() splits on this,
# which removes these characters from the front and end of each word
char = re.compile(r"""[\s?"()\[\]]+""")
################################################
	
def read_file(file):
	""" reads a single file in one big chunk
	The file name passed by the calling statement or function is
	appended to the global variable (path) to build the location that is
	opened. At this time the entire file is read in one chunk, so if
	really large files are to be expected this needs to be changed. The
	content of the file is returned to the calling statement or function.

	depends:
		global: path <string>
		passed: file <string>
	returns:
		content <string>
	"""
	# the with statement closes the handle even if read() raises; before
	# this the handle was never closed and (file) was rebound to it
	with open(path + file, 'r') as text_file:
		content = text_file.read()
	return content
	
def split_content(content):
	""" splits content string into a list of lower case words
	The split is on the characters matched by the global pattern (char).
	The resulting words may need further processing later; empty strings
	can appear in the list when the content starts or ends with a
	separator.

	depends:
		import: re <module>
		global: char <compiled pattern>
		passed: content <string>
	returns: words <list>
	"""
	# drop the final newline only when there really is one; slicing off the
	# last character unconditionally cut the last word short in files that
	# do not end with a newline
	if content.endswith("\n"):
		content = content[:-1]
	# the lower() method replaces the deprecated string.lower() function
	words = re.split(char, content.lower())
	return words

def fix_words(words):
	""" repairs, further refines, and selects words *STILL NEEDS WORK*
	Loops through a word list and selects, rejects, or refines each word:
	  - empty strings are rejected
	  - purely alphabetic or purely numeric words are accepted
	  - single characters listed in (symbol_words) are accepted
	  - words wrapped in single quotes are accepted with the quotes
	    stripped, unless nothing is left after stripping
	  - words with an apostrophe after the first character, or a hyphen
	    strictly inside the word, are accepted as they are
	  - everything else is rejected

	depends:
		global: symbol_words <list>
		passed: words <list>
	returns: good_words <list>
	"""
	good_words = [] #start new fresh list
	for word in words:
		if not word:
			continue
		if word.isalpha() or word.isdigit():
			good_words.append(word)
		elif len(word) == 1 and word in symbol_words:
			good_words.append(word)
		elif word.startswith("'") and word.endswith("'"):
			# strip the single quotes on start and end; a lone ' or ''
			# leaves nothing behind and must not be kept as a word
			unquoted = word[1:-1]
			if unquoted:
				good_words.append(unquoted)#maybe should recheck these
		elif word.count("'", 1) or word.count("-", 1, len(word) - 1):
			good_words.append(word)
	return good_words
	
def count_words(good_words):
	"""count words in current file then put words & count in dictionary
	The acceptable words are taken one at a time. A word that is not
	yet in the dictionary gets a count of 1, and a word that is already
	there has its count incremented by 1.

	depends:
		passed: good_words <list>
	returns: file_words_dict <dictionary>
	"""
	file_words_dict = {} # every unique word in the file with its count
	for word in good_words:
		# get() with a default of 0 covers both the first sighting and the
		# repeats, and avoids has_key() which newer Pythons no longer have
		file_words_dict[word] = file_words_dict.get(word, 0) + 1
	return file_words_dict

def write_count_file(file_words_dict,file):
	""" writes file's word count to related file
	Creates the file containing each acceptable word that was in the
	current file. The output file is named after the text file with
	".count" appended and is placed in (count_dir) under (base_path).
	Each line holds one word followed by its count, sorted by word.

	depends:
		global: base_path <string>
		global: count_dir <string>
		passed: file_words_dict <dictionary>
		passed: file <string>
	returns:
		file_words_list <list> of (word, count) pairs, sorted
	"""
	# sorted() gives a sorted list whether or not items() returns a list
	file_words_list = sorted(file_words_dict.items())
	count_file = base_path + count_dir + file + ".count"
	# the with statement closes the count file even if a write fails
	with open(count_file, "w") as word_count_list:
		for word, count in file_words_list:
			# the space before the newline matches what the earlier
			# print based writer produced, so the file format is unchanged
			word_count_list.write("%s %s \n" % (word, count))
	return file_words_list
def add_to_total_dict(file_words_list,file):
	"""add one file's word counts to the dictionary of total words
	Each (word, count) pair is folded into the global dictionary
	(total_words_dict): a word seen for the first time starts at its
	count for this file, a word already present has that count added.
	The dictionary is updated in place and also returned.

	depends:
		global: total_words_dict <dictionary>
		passed: file_words_list <list> of (word, count) pairs
		passed: file <string> (not used at present)
	returns:
		total_words_dict <dictionary>
	"""
	# unpack each pair; slicing the entry gave one element tuples instead
	# of the word and the count themselves
	for word, count in file_words_list:
		total_words_dict[word] = total_words_dict.get(word, 0) + count
	# hand the dictionary back so the caller's assignment keeps the totals
	return total_words_dict

def write_total(total_words_dict):
	""" placeholder for writing the total words in a list
	Still needs to be written: for now the argument is accepted,
	nothing is done with it, and None is returned.

	depends:
		passed: total_words_dict <dictionary>
	returns: None
	"""
	return None

## end functions - start higher level logic ##
print("\nHere we go...")

total_words_dict = {} #will contain all the words in all files
# read_file() looks this global up to find the text files
path = base_path + text_dir
for file in os.listdir(path):
	# only regular files are counted, anything else is skipped
	if not os.path.isfile(path + file):
		continue
	print("\nthis file is ## %s" % file) #debug line
	content = read_file(file)# get raw contents of file (file)
	words = split_content(content)# split raw contents into basic words
	good_words = fix_words(words)# turn raw words into list of good words
	file_words_dict = count_words(good_words)# make file_words_dict from good words
	file_words_list = write_count_file(file_words_dict,file)# write a .count file for current file
	# combine total dict with local dict; the running total is only
	# replaced when a dictionary comes back, so a None result can never
	# wipe it out and break the next pass through the loop
	merged_dict = add_to_total_dict(file_words_list,file)
	if merged_dict is not None:
		total_words_dict = merged_dict
# TODO: save the total word list once write_total() has been written

## END ##
print("That's all folks!")