'''
Created on 23/03/2014, tested for Python 2.7

@authors: Juan Pablo Posadas, Grigori Sidorov

Class for obtaining syntactic n-grams from dependency trees using Stanford parser output

NOTE: Input tree should NOT be collapsed:
    -outputFormat "wordsAndTags, typedDependencies" -outputFormatOptions "basicDependencies"

NOTE: Since ",", "[", "]", and "\\" are part of our metalanguage, we add a slash to them
when they are part of the sentence (sn-grams), e.g., "\,", "\[", "\\"
'''

import copy
import sys
import codecs


class SNgrams(object):
    '''
    Class for obtaining syntactic n-grams from dependency trees using Stanford parser output
    Both continuous and non-continuous n-grams
    '''

    def __init__(self):
        self.words = {}           # Dictionary of original words indexed by their positions
        self.deps = {}            # Dictionary with the head (governor) position of each word
        self.rels = {}            # Dictionary with dependency relations
        self.children = {}        # Dictionary with the words that depend on a (key) word
        self.leafs = []           # List of indexes of words that are leaves
        self.root_idx = 0         # Index of the root
        self.DepNgrams = []       # Auxiliary container of the sn-grams
        self.listDep = []         # List of n-grams of words organized by size
        self.listSRTags = []      # List of n-grams of SR tags organized by size
        self.listContDep = []     # List of CONTINUOUS n-grams of words organized by size
        self.listContSRTags = []  # List of CONTINUOUS n-grams of SR tags organized by size

    ######################
    def reset_vars(self):
        self.words.clear()
        self.deps.clear()
        self.rels.clear()
        self.children.clear()
        del self.leafs[:]
        self.root_idx = 0
        del self.DepNgrams[:]
        del self.listDep[:]
        del self.listSRTags[:]
        del self.listContDep[:]
        del self.listContSRTags[:]

    ######################
    def process_sentence(self, lines, option):
        '''
        This method calls the specific methods (general steps) for producing sn-grams
        according to the parameter "option"
        '''
        self.reset_vars()
        self.prepare_indices(lines)
        self.print_parsed_sentence()
        if option in [0, 1, 2]:
            self.get_all_DepNgrams()
            if option in [0, 1]:
                self.store_all_DepNgrams(option)
            else:
                self.store_all_DepNgrams(0)
                self.store_all_DepNgrams(1)
        elif option in [3, 4, 5]:
            max_height = self.get_cont_DepNgrams()
            if option in [3, 4]:
                self.store_cont_DepNgrams(max_height, option)
            else:
                self.store_cont_DepNgrams(max_height, 3)
                self.store_cont_DepNgrams(max_height, 4)
        else:
            print "Invalid option."
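    # Example input for prepare_indices (illustrative, hypothetical sentence "John loves Mary"):
    # one typed-dependency line per word, given in the order of the dependent word's
    # position in the sentence:
    #     nsubj(loves-2, John-1)
    #     root(ROOT-0, loves-2)
    #     dobj(loves-2, Mary-3)
    # From the first line it extracts rels[1] = "nsubj", deps[1] = 2 (the head position)
    # and words[1] = "John"; the line whose head position is 0 identifies the root.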
    ######################
    def prepare_indices(self, lines):
        for idx, line in enumerate(lines):
            self.rels[idx + 1] = line[ : line.find("(")]
            line = line[line.find("("):]
            p_idx = int(line[line.find("-") + 1 : line.find(",", line.find("-"))])
            self.deps[idx + 1] = p_idx
            self.words[idx + 1] = line[line.find(",") + 2 : line.rfind("-")]
            if self.words[idx + 1] == ',':
                self.words[idx + 1] = "\,"
            elif self.words[idx + 1] == '[':
                self.words[idx + 1] = "\["
            elif self.words[idx + 1] == ']':
                self.words[idx + 1] = "\]"
            elif self.words[idx + 1] == '\\':
                self.words[idx + 1] = "\\\\"
            self.children[p_idx] = self.children.get(p_idx, [])
            self.children[p_idx].append(idx + 1)
            if self.deps[idx + 1] == 0:
                self.root_idx = idx + 1
                self.rels[idx + 1] = "root"  # Line added because of the FREELING output
        # Determine if a word is a leaf
        for i in self.words.keys():
            if i not in self.children.keys():
                self.leafs.append(i)
        # Initialize containers for n-grams
        for i in xrange(0, len(lines)):
            self.listSRTags.append([])
            self.listDep.append([])

    ######################
    def print_parsed_sentence(self):
        line = ""
        for i in sorted(self.words.keys()):
            line += self.words[i] + " "
        print line.rstrip(" ")
        line = ""
        for i in self.words.keys():
            line = str(i) + "\t"
            line += self.words[i] + "\t"
            line += self.rels[i] + "\t"
            line += str(self.deps[i]) + "\t"
            if i not in self.leafs:
                line += str(self.children[i])
            print line
        print "Leaf nodes are:"
        line = ""
        for i in self.leafs:
            line += self.words[i] + ", "
        print line

    #################################
    def get_cont_DepNgrams(self):
        '''
        Method that starts the search for continuous sn-grams
        '''
        del self.DepNgrams[:]
        path = []  # Auxiliary container for all the possible paths
        max_height = 0
        self.DepNgrams.append([self.root_idx])
        path.append(self.root_idx)
        max_height = self.traverse_pathDep(self.root_idx, path, max_height)
        return max_height

    def store_cont_DepNgrams(self, max_height, op):
        '''
        Method that translates the word indexes of a continuous sn-gram
        into the words themselves or into their SR tags
        '''
        if op == 3:  # Case for continuous word sn-grams
            for i in xrange(0, max_height):  # Initialize the final container of the sn-grams
                self.listContDep.append([])
            for sngram in self.DepNgrams:  # Classify the sn-gram by its size
                line = ""
                for item in sngram:  # Translate index into word
                    line += self.words[item] + " "
                line = line.rstrip(" ")
                self.listContDep[len(sngram) - 1].append(line)
        elif op == 4:
            for i in xrange(0, max_height):  # Initialize the final container of the sn-grams
                self.listContSRTags.append([])
            for sngram in self.DepNgrams:  # Classify the sn-gram by its size
                line = ""
                for item in sngram:
                    line += self.rels[item] + " "
                line = line.rstrip(" ")
                self.listContSRTags[len(sngram) - 1].append(line)

    ######################
    def traverse_pathDep(self, idx, path, max_height):
        '''
        Method that builds all the possible paths of the tree
        '''
        if idx not in self.leafs:
            for child in self.children[idx]:
                path.append(child)
                for tam in xrange(0, len(path)):  # Sizes of the possible sn-grams that can be obtained from the path
                    sngram = []
                    for i in xrange(tam, len(path)):
                        sngram.append(path[i])  # Generate the possible sn-grams according to the size
                    self.DepNgrams.append(copy.deepcopy(sngram))  # Update the container with the new sn-gram
                    size = len(sngram)
                    if size > max_height:
                        max_height = size
                max_height = self.traverse_pathDep(child, path, max_height)
                path.pop()  # Erase the last node that was added to the path
        return max_height
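    # Example of the sn-gram metalanguage (illustrative, for the hypothetical sentence
    # "John loves Mary"): prepare_SNgram below renders the index lists produced by
    # traverse_DepTree as strings such as "loves[John]" and "loves[John,Mary]", and
    # len_Ngram recovers the n-gram size from that notation, e.g.
    # len_Ngram("loves[John,Mary]") == 3 (1 + one "[" + one ",").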
    ######################
    def prepare_SNgram(self, line, op):
        ngram = ""
        for item in line:
            if type(item) is str:
                ngram += item
            elif type(item) is int:
                if op == 0:
                    ngram += self.words[item]
                else:
                    ngram += self.rels[item]
            else:
                ngram += self.prepare_SNgram(item, op)
        return ngram

    ######################
    def len_Ngram(self, ngram):
        n = 1
        n += ngram.count("[")
        n += ngram.count(",")
        n -= ngram.count("\[")
        n -= ngram.count("\,")
        return n

    ########################
    def get_all_DepNgrams(self):
        '''
        This method begins the process of getting all the sn-grams of the dependency tree
        '''
        del self.DepNgrams[:]
        if self.root_idx > 0:
            self.DepNgrams.append(copy.deepcopy([self.root_idx]))  # Add the first sn-gram found (the root element)
            self.traverse_DepTree(self.root_idx)  # Call the method that obtains the rest of the sn-grams
        else:
            print "Error, no root found"

    def store_all_DepNgrams(self, op):
        '''
        This method stores the sn-grams in the container specified by the parameter op
        '''
        for item in self.DepNgrams:
            ngram = self.prepare_SNgram(item, op)
            size = self.len_Ngram(ngram)
            if op == 0:  # According to the parameter, the sn-grams are stored in the corresponding container
                self.listDep[size - 1].append(ngram)
            elif op == 1:
                self.listSRTags[size - 1].append(ngram)

    ######################
    def traverse_DepTree(self, idx):
        # Recursive function that traverses the syntactic tree and generates syntactic n-grams
        sig = []           # List of sn-grams for the current node
        combinations = []  # List of all possible combinations of nodes and their children
        children = self.children[idx]
        # First sn-grams in a subtree
        for child in children:
            sig = []
            sig.append(child)
            self.DepNgrams.append(copy.deepcopy(sig))
        combinations = copy.deepcopy(self.get_next_combinations(idx, children))
        self.DepNgrams.extend(copy.deepcopy(combinations))  # We save the new sn-grams in the global list
        for num, child in enumerate(children):
            if child in self.leafs:
                if num + 1 == len(children):
                    return combinations
            else:
                sig = []
                sig = self.traverse_DepTree(child)
                while len(sig) > 0:
                    temp = []
                    ngram = sig.pop(0)
                    for idx2 in xrange(0, len(combinations)):
                        aux = copy.deepcopy(combinations[idx2])
                        for idx3, item in enumerate(aux):
                            if item == ngram[0]:
                                aux.pop(idx3)
                                aux.insert(idx3, copy.deepcopy(ngram))
                                self.DepNgrams.append(copy.deepcopy(aux))
                                temp.append(copy.deepcopy(aux))
                    if len(temp) > 0:
                        combinations.extend(copy.deepcopy(temp))
        return combinations

    ######################
    def get_next_combinations(self, value, children):
        ngram = []
        combinations = []  # Auxiliary variable for generating a combination
        lista = []         # Auxiliary variable for all sn-grams during analysis of a sub-tree
        # Initialize the list of combinations
        for p in xrange(0, len(children)):
            combinations.append(0)
        # Generating sn-grams
        for r in xrange(1, len(children) + 1):
            for j in xrange(1, r + 1):
                combinations[j - 1] = j - 1
            #################### The first combination
            ngram = []
            ngram.append(value)
            ngram.append("[")
            for z in xrange(0, r):
                ngram.append(children[combinations[z]])
                ngram.append(",")
            ngram.pop(len(ngram) - 1)
            ngram.append("]")
            lista.append(copy.deepcopy(ngram))
            ################### The rest
            top = self.Combination(len(children), r)
            for j in xrange(2, top + 1):
                m = r
                val_max = len(children)
                while combinations[m - 1] + 1 == val_max:
                    m -= 1
                    val_max -= 1
                combinations[m - 1] += 1
                for k in xrange(m + 1, r + 1):
                    combinations[k - 1] = combinations[k - 2] + 1
                ngram = []
                ngram.append(value)
                ngram.append("[")
                for z in xrange(0, r):
                    ngram.append(children[combinations[z]])
                    ngram.append(",")
                ngram.pop(len(ngram) - 1)
                ngram.append("]")
                lista.append(copy.deepcopy(ngram))
        return lista
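    # Note: Combination below computes the binomial coefficient C(sz, r), i.e. how many
    # subsets of size r can be chosen from sz children; e.g. Combination(4, 2) == 6.
    # A shorter, behaviourally equivalent sketch (assuming 1 <= r <= sz) would be:
    #     from math import factorial
    #     def Combination(self, sz, r):
    #         return factorial(sz) // (factorial(r) * factorial(sz - r))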
    ######################
    def Combination(self, sz, r):
        if sz == r:
            numerator = 1
        else:
            numerator = sz
            for i in xrange(1, sz):
                numerator *= sz - i
            aux = r
            for i in xrange(1, r):
                aux *= r - i
            divisor = sz - r
            for i in xrange(1, sz - r):
                divisor *= sz - r - i
            numerator = numerator / (aux * divisor)
        return numerator

    ######################
    def print_all_sn_grams(self, option):
        # Print the kind(s) of sn-grams selected by option
        if option == 0:
            self.print_Words_sngrams()
        elif option == 1:
            self.print_SR_sngrams()
        elif option == 2:
            self.print_Words_sngrams()
            self.print_SR_sngrams()
        elif option == 3:
            self.print_cont_Words_sngrams()
        elif option == 4:
            self.print_cont_SR_sngrams()
        elif option == 5:
            self.print_cont_Words_sngrams()
            self.print_cont_SR_sngrams()

    def print_Words_sngrams(self):
        print "************SNgrams of words/POS tags:"
        for idx, sn_list in enumerate(self.listDep):
            print "************Size: " + str(idx + 1)
            for item in sn_list:
                print item

    def print_SR_sngrams(self):
        print "************SNgrams of tags of syntactic relations (SR tags):"
        for idx, sn_list in enumerate(self.listSRTags):
            print "************Size: " + str(idx + 1)
            for item in sn_list:
                print item

    def print_cont_Words_sngrams(self):
        print "************Continuous SNgrams of words/POS tags:"
        for idx, sn_list in enumerate(self.listContDep):
            print "************Size: " + str(idx + 1)
            for item in sn_list:
                print item

    def print_cont_SR_sngrams(self):
        print "************Continuous SNgrams of tags of syntactic relations (SR tags):"
        for idx, sn_list in enumerate(self.listContSRTags):
            print "************Size: " + str(idx + 1)
            for item in sn_list:
                print item

    ######################
    def write_all_sn_grams(self, f2, option):
        '''
        This method writes one kind of sn-grams to a file according to the value of option
        '''
        if option == 0:
            self.write_WordSngrams(f2)
        elif option == 1:
            self.write_SRSngrams(f2)
        elif option == 2:
            self.write_WordSngrams(f2)
            self.write_SRSngrams(f2)
        elif option == 3:
            self.write_cont_WordSngrams(f2)
        elif option == 4:
            self.write_cont_SRSngrams(f2)
        elif option == 5:
            self.write_cont_WordSngrams(f2)
            self.write_cont_SRSngrams(f2)

    def write_WordSngrams(self, f2):
        f2.write("************SNgrams of words/POS tags:\n")
        for idx, sn_list in enumerate(self.listDep):
            f2.write("************Size: " + str(idx + 1) + "\n")
            for item in sn_list:
                f2.write(item + "\n")
        f2.write("\n")

    def write_SRSngrams(self, f2):
        f2.write("************SNgrams of tags of syntactic relations (SR tags):\n")
        for idx, sn_list in enumerate(self.listSRTags):
            f2.write("************Size: " + str(idx + 1) + "\n")
            for item in sn_list:
                f2.write(item + "\n")
        f2.write("\n")

    def write_cont_WordSngrams(self, f2):
        f2.write("************Continuous SNgrams of words/POS tags:\n")
        for idx, sn_list in enumerate(self.listContDep):
            f2.write("************Size: " + str(idx + 1) + "\n")
            for item in sn_list:
                f2.write(item + "\n")
        f2.write("\n")

    def write_cont_SRSngrams(self, f2):
        f2.write("************Continuous SNgrams of tags of syntactic relations (SR tags):\n")
        for idx, sn_list in enumerate(self.listContSRTags):
            f2.write("************Size: " + str(idx + 1) + "\n")
            for item in sn_list:
                f2.write(item + "\n")
        f2.write("\n")


############
def process_one_sentence(result, lines, sent_num, f2, option):
    print "Sentence " + str(sent_num)
    result.process_sentence(lines, option)
    result.print_all_sn_grams(option)
    result.write_all_sn_grams(f2, option)
    return sent_num + 1
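############
# Example usage (illustrative): with an input file that contains one typed-dependency
# line per word and an empty line between sentences, e.g.
#
#     nsubj(loves-2, John-1)
#     root(ROOT-0, loves-2)
#     dobj(loves-2, Mary-3)
#
# the script could be run as
#     python SNGrams.py input.txt output.txt 2
# to print and write both the word sn-grams and the SR-tag sn-grams.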
############### MAIN ################################
if __name__ == '__main__':

    if len(sys.argv) < 3:
        print "Usage with at least two parameters:"
        print "python SNGrams.py input output"
        exit(1)

    fname_in = sys.argv[1]   # 'input.txt'
    fname_out = sys.argv[2]  # 'output.txt'

    option = 0
    if len(sys.argv) > 3:
        option = int(sys.argv[3])
        # value 0: for sngrams of words;
        # value 1: for sngrams of sr tags;
        # value 2: for sngrams of words and sr tags (equivalent to calling with option 0 and then with option 1)
        # value 3: for continuous sngrams of words;
        # value 4: for continuous sngrams of sr tags;
        # value 5: for continuous sngrams of words and sr tags (equivalent to calling with option 3 and then with option 4)

    encod = 'utf-8'  # 'utf-8' or '1252'
    print "You are assuming encoding: " + encod

    try:
        f1 = codecs.open(fname_in, "rU", encoding=encod)
    except IOError as e:
        print "I/O error({0}): {1}".format(e.errno, e.strerror)
        exit(1)

    try:
        f2 = codecs.open(fname_out, "wb", encoding=encod)  # b - binary, for Unix line endings
    except IOError as e:
        print "I/O error({0}): {1}".format(e.errno, e.strerror)
        exit(1)

    sent_num = 1
    result = SNgrams()
    lines = []

    while True:
        ln = f1.readline()
        if not ln:
            break
        ln = ln.strip()
        if ln == "":  # Sentences are separated by an EMPTY line
            if len(lines) > 0:
                sent_num = process_one_sentence(result, lines, sent_num, f2, option)
                del lines[:]
        else:
            lines.append(ln)

    if len(lines) > 0:  # Last piece
        sent_num = process_one_sentence(result, lines, sent_num, f2, option)

    f1.close()
    f2.close()
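# For the illustrative "John loves Mary" input above, running with option 0 would yield
# the word sn-grams grouped by size:
#     Size 1: loves, John, Mary
#     Size 2: loves[John], loves[Mary]
#     Size 3: loves[John,Mary]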