# Script that tokenizes a list of input text files
import sys

from nltk.tokenize import regexp_tokenize

# Match runs of word characters (letters, digits, underscore)
pattern = r'\w+'

for arg in sys.argv[1:]:
    # Read each file named on the command line and print its tokens
    with open(arg, encoding='utf-8') as f:
        text = f.read()
    print(arg + ':')
    print(regexp_tokenize(text, pattern))
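
# Usage sketch (script and file names below are illustrative; assumes
# NLTK is installed and the named files exist in the working directory):
#   python tokenize_texts.py sample1.txt sample2.txt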
