# tokenize.py - Let's look at another simple case of
# splitting a string into tokens. This time, the string contains
# consecutive delimiters, and we want to deal with the empty
# tokens they produce.

# In this case we need to make use of Python's powerful
# re, or 'regular expression', library.
import re

s = "please...give..us some......cookies!!!! thank...you"

# These two statements are the magic. They could be
# combined into a single statement if you wish. First,
# we create a tokenizer object, then use it to split the string.
tok = re.compile("[.! ]")
tokenlist = tok.split(s)

print("\nTokenizing the string gives us:")
print(tokenlist)

# Now, the problem we have is that our list of tokens
# includes a lot of empty strings. Let's remove them
# from the list: use a list comprehension with an 'if'.
tokenlist = [t for t in tokenlist if len(t) > 0]

print("\nAfter removing empty tokens:")
print(tokenlist)

# -------------------------------------------------------------
# Next, let's read an input file and tokenize the words in it.
# We will tokenize each line, then add each line's list
# of words onto a global word list.
# We can combine lists by adding them.
# Compile the pattern once, outside the loop, so it is not
# rebuilt for every line.
linetok = re.compile("[.?!: \n]")

inFile = open("jerusalem.txt", "r")
wordlist = []
for line in inFile:
    linelist = linetok.split(line)
    linelist = [t for t in linelist if len(t) > 0]
    wordlist += linelist

print("\nHere are the individual words from the input file:")
print(wordlist)

inFile.close()
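
# -------------------------------------------------------------
# A minimal sketch of an alternative (not part of the original
# lesson): adding a '+' quantifier to the character class makes
# re.split() treat a whole run of consecutive delimiters as a
# single separator, so most of the empty strings never appear.
# A leading or trailing delimiter can still leave one empty token
# at either end, so the filter is kept as a safety net.
alt_tokens = re.split("[.! ]+", s)
alt_tokens = [t for t in alt_tokens if t]
print("\nUsing the pattern '[.! ]+' gives us:")
print(alt_tokens)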