# tokenize.py - Let's look at another simple case of
# splitting a string into tokens. This time, the string contains
# consecutive delimiters, and we want to deal with the empty
# tokens they produce.

# In this case we need to make use of Python's powerful
# re, or 'regular expression', library.
import re

s = "please...give..us some......cookies!!!! thank...you"

# These two statements are the magic. They could be
# combined into a single statement if you wish. First,
# we create a tokenizer object, then use it to split the string.
tok = re.compile("[.! ]")
tokenlist = tok.split(s)

print("\nTokenizing the string gives us:")
print(tokenlist)

# Now, the problem we have is that our list of tokens
# includes a lot of empty strings. Let's remove them
# from the list: use a list comprehension with an 'if'.
tokenlist = [t for t in tokenlist if len(t) > 0]

print("\nAfter removing empty tokens:")
print(tokenlist)

# -------------------------------------------------------------
# Next, let's read an input file and tokenize the words in it.
# We will tokenize each line, then add each line's list
# of words onto a global word list.
# We can combine lists by adding them.
# Compile the pattern once, outside the loop, so it is not
# rebuilt for every line.
linetok = re.compile("[.?!: \n]")

inFile = open("jerusalem.txt", "r")
wordlist = []
for line in inFile:
    linelist = linetok.split(line)
    linelist = [t for t in linelist if len(t) > 0]
    wordlist += linelist

print("\nHere are the individual words from the input file:")
print(wordlist)

inFile.close()
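
# -------------------------------------------------------------
# A minimal sketch of an alternative (not part of the original
# lesson): adding a '+' quantifier to the character class makes
# re.split() treat a whole run of consecutive delimiters as a
# single separator, so most of the empty strings never appear.
# A leading or trailing delimiter can still leave one empty token
# at either end, so the filter is kept as a safety net.
alt_tokens = re.split("[.! ]+", s)
alt_tokens = [t for t in alt_tokens if t]
print("\nUsing the pattern '[.! ]+' gives us:")
print(alt_tokens)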