''' Version 2.0 Created on 28/05/2014, tested for Python 2.7 @authors: Juan Pablo Posadas, Grigori Sidorov Class for obtaining syntactic n-grams from dependency trees using Stanford parser output NOTE: Input tree should NOT be collapsed: -outputFormat "wordsAndTags, typedDependencies" -outputFormatOptions "basicDependencies" NOTE: Since ",", "[", "]", and "\" are part of our metalanguage, we add a slash to them when they are part of the sentence (sn-grams), e.g., "\,", "\[", "\\" This version allows to process separately nodes that have too many children (for a given threshold) and select min and max sizes of sn-grams. Default: 5, 2, 7 ''' from sets import Set import copy, sys import codecs class DepInfo(object): ''' This class represents the dependency information of a sentence ''' def __init__(self, lines): ''' Constructor ''' self.word = {} #Dictionary of original words according to their positions self.dep = {} #Dictionary with dependent words self.rel = {} #Dictionary with dependency relations self.children = {} #Dictionary with words that are dependent for a (key) word self.leaves = [] #List of indexes of words that are leaves self.root_idx = -1 #Index of the root self.prepare_indices(lines) def prepare_indices(self, lines): for idx, line in enumerate(lines): self.rel[idx + 1] = line[ : line.find("(")] line=line[line.find("("):] p_idx = int(line [line.find("-") + 1 : line.find(",",line.find("-"))] ) self.dep [idx + 1] = p_idx self.word[idx + 1] = line[line.rfind(", ") + 2 : line.rfind("-")] if self.word[idx+1] == ',': self.word[idx+1] = "\," elif self.word[idx+1] == '[': self.word[idx+1] = "\[" elif self.word[idx+1] == ']': self.word[idx+1] = "\]" elif self.word[idx+1] == '\\': self.word[idx+1] = "\\\\" self.children[p_idx] = self.children.get(p_idx, []) self.children[p_idx].append(idx + 1) if self.dep [idx + 1] == 0: self.root_idx = idx + 1 self.rel[idx+1] = "root"#Line added because of the FREELING output #Determine if a word is a leaf for i in self.word.keys(): if i not in self.children.keys(): self.leaves.append(i) class BiSNgrams(object): ''' classdocs ''' def __init__(self, min_size, max_size, max_num_children, option): ''' Constructor ''' self.min_size = min_size #The minimum size for the sn-grams self.max_size = max_size #The maximum size for the sn-grams self.max_num_children = max_num_children #The maximum number of children per node if option in [0,1,2]: self.option = option #Type of sn-grams to be obtained: 0 for WORD sn-grams; 1 for sn-grams of SR Tags ; 2 for both else: print "Invalid value for the parameter: option" exit(1) self.subtrees = [] #List that contains all the nodes that are not leaves self.DepNgrams = [] self.log = [] #List that contains the nodes that have more children than the parameter max_num_children #self.dicSRTags = {} #self.dicWordNgrams = {} self.dicSRTags = [] self.dicWordNgrams = [] for i in xrange(min_size, max_size+1): self.dicSRTags.append({}) self.dicWordNgrams.append({}) def reset_vars (self): del self.subtrees[:] del self.DepNgrams[:] del self.log[:] #self.dicWordNgrams.clear() #self.dicSRTags.clear() def print_parsed_sentence(self, sentence): line = "*****Sentence: " for i in sorted(sentence.word.keys()): line += sentence.word[i]+" " print line.rstrip(" ") ''' line = "" for i in sentence.word.keys(): line = str(i) + "\t" line += sentence.word[i] + "\t" line += sentence.rel [i] + "\t" line += str(sentence.dep [i]) + "\t" if i not in sentence.leaves: line += str(sentence.children[i]) print line print "Leaf nodes are:" line = "" for i in sentence.leaves: line += sentence.word[i] + ", " print line ''' def write_all_sn_grams (self, f2): ''' This method write in a file one kind of sn-gram according with the value of op ''' if self.option == 0 or self.option == 2: self.write_WordSngrams(f2) if self.option == 1 or self.option == 2: self.write_SRSngrams(f2) def write_WordSngrams(self,f2): f2.write("************sn-grams of words/POS tags:\n") for idx, dic in enumerate(self.dicWordNgrams): f2.write ("\n************Size: " + str(idx + self.min_size) + "\n") if len(dic.keys()) > 0: for item in dic.keys(): f2.write (item + "\t" + str(dic[item]) + "\n") else: f2.write("EMPTY\n") f2.write("\n") def write_SRSngrams(self,f2): f2.write("************sn-grams of tags of syntactic relations (SR tags):\n") for idx, dic in enumerate(result.dicSRTags): f2.write ("\n************Size: " + str(idx + self.min_size) + "\n") if len(dic.keys()) > 0: for item in dic.keys(): f2.write (item + "\t" + str(dic[item]) + "\n") else: f2.write("EMPTY\n") f2.write("\n") def print_sngrams(self): if self.option == 0 or self.option == 2: print "************sn-grams of words/POS tags:" for idx, dic in enumerate(self.dicWordNgrams): print "\n************Size: " + str(idx + self.min_size) if len(dic.keys()) > 0: for item in dic.keys(): print item + "\t"+str(dic[item]) else: print "EMPTY" print "****************************************" if self.option == 1 or self.option == 2: print "************sn-grams of tags of syntactic relations (SR tags):" for idx, dic in enumerate(self.dicSRTags): print "\n************Size: " + str(idx + self.min_size) if len(dic.keys()) > 0: for item in dic.keys(): print item + "\t"+str(dic[item]) else: print "EMPTY" print "****************************************" def process_sentence (self, lines): ''' This method calls the specific methods (general steps) for producing sn-grams according to the parameter "option" ''' self.reset_vars() sentence = DepInfo(lines) self.print_parsed_sentence(sentence) for i in sentence.word.keys(): if i in sentence.children.keys(): self.subtrees.append(i)#Store all the possible roots of the subtrees if self.option in [0,1,2]: if self.min_size >= 0: if self.max_size >= self.min_size: if self.min_size <= len(sentence.word): if self.max_size > len(sentence.word): line = "\tMessage: The value of the maximum size exceeds the length of the sentence.\n" #print line if self.min_size == 0 or self.max_size == 0: line = "\tMessage: the program will obtain the sn-grams of all possible sizes.\n" #print line log = self.get_all_DepNgrams(sentence) if len(log) > 0: line = "\tThe next words have more than "+ str(self.max_num_children) +" children:\n" print line for item in log: line = "\t\t"+sentence.word[item]+"\n" print line if self.option in [0,1]: self.store_all_DepNgrams(sentence, self.option) else: self.store_all_DepNgrams(sentence, 0) self.store_all_DepNgrams(sentence, 1) #self.show_sngrams(sentence, log) #This method only shows the sn-grams obtained from the sentence else: line = "\tERROR: The value of the minimum size exceeds the length of the sentence\n" # print line else: line = "\tERROR: The maximum size must be greater than the minimum size\n" print line else: line = "\tERROR: The value of the minimum size is not allowed\n" print line else: line = "\tERROR: Invalid value for the parameter option\n" print line def prepare_SNgram(self, line, sentence, op): ''' op=-1 for sn-grams of index op=0 for sn-grams of words op=1 for sn-grams of sr tags ''' ngram = "" for item in line: if type(item) is str: ngram += item elif type(item) is int: if op == -1: ngram += str(item) elif op == 0: ngram += sentence.word[item] else: ngram += sentence.rel[item] else: ngram += self.prepare_SNgram(item, sentence, op) return ngram ###################### def is_continuous(self, ngram): ''' This method tests if a sn-gram is continuous or not. It assumes that no punctuation characters are allowed in the sn-gram. Used for testing. ''' answer = "" if ngram.count(",") > 0: answer = "NO" else: answer = "YES" return answer ###################### def len_Ngram(self, ngram): n = 1 n += ngram.count("[") n += ngram.count(",") n -= ngram.count("\[") n -= ngram.count("\,") return n ######################## def get_all_DepNgrams(self, sentence): ''' This method begins the process of getting all the sn-grams of the dependency tree ''' unigrams = [] #Auxiliar variable that contains all the unigrams combinations = [] #Auxiliar variable that contains all the combinations of a node with its children aux = [] log = Set() if sentence.root_idx > 0: unigrams, combinations, log = self.get_subtrees (sentence)#Call this method first for obtaining all the posible subtrees if len(unigrams) > 0: self.DepNgrams.append([sentence.root_idx]) self.DepNgrams.extend(unigrams) #Adds the unigrams to the general container for item in combinations: #Adds the first sn-grams to the general container if self.min_size != 0 or self.max_size != 0: size = self.len_Ngram(self.prepare_SNgram(item[0], sentence, -1)) if size >= self.min_size and size <= self.max_size: #Check the size of the new sn-grams self.DepNgrams.append(copy.deepcopy(item[0])) if size < self.max_size: aux.append(item) else: self.DepNgrams.append(copy.deepcopy(item[0])) if self.min_size != 0 or self.max_size != 0: self.compound_sngrams(aux, sentence) #This function generates the rest of sn-grams else: self.compound_sngrams(combinations, sentence) else: line = "\tError, no root found\n" print line return(log) ###################### def store_all_DepNgrams(self, sentence, op): ''' This method stores the sn-grams in the container specified by the parameter "op" ''' for item in self.DepNgrams: ngram = self.prepare_SNgram (item, sentence, op) if op == 0: #According to the params, the sn-grams are stored in the container size = self.len_Ngram(ngram) dic = self.dicWordNgrams[size-self.min_size] if dic.has_key(ngram): #Update the dictionary of Words sn-grams dic[ngram] += 1 #If the sn-gram exists in the dictionary, update its frequency else: dic[ngram] = 1 #Otherwise, add the sn-gram to the dictionary else: size = self.len_Ngram(ngram) dic = self.dicSRTags[size-self.min_size] if dic.has_key(ngram): #Update the dictionary of sn-grams of SR Tags dic[ngram] += 1 #If the sn-gram exists in the dictionary, update its frequency else: dic[ngram] = 1 #Otherwise, add the sn-gram to the dictionary ###################### def compound_sngrams(self, original, sentence): combinations = [] candidates = [] for combination in original: #This cycle initializes the list of combinations and list of candidates if len(combination[1]) > 0: size = self.len_Ngram(self.prepare_SNgram(combination[0], sentence, -1)) combinations.append([combination[0],combination[1],size]) if combination[0][0] != sentence.root_idx: size = self.len_Ngram(self.prepare_SNgram(combination[0], sentence, -1)) candidates.append([combination[0],combination[1],size]) while len(candidates) > 0: #In this cycle, select a sn-gram to be replaced in the rest of combinations candidate = candidates.pop(0) value = candidate[0][0] #Get the first number of the first candidate sn-gram for combination in combinations: if value in combination[1]: position = combination[0].index(value,2)#First get the position of the element sngram = copy.deepcopy(combination) sngram[0].pop(position) #Delete the element in the sn-gram sngram[0].insert(position,candidate[0]) #Insert the new part into the sn-gram sngram[1].remove(value) #Update its list of posible combinations sngram[2] = self.len_Ngram(self.prepare_SNgram(sngram[0], sentence, -1))#Obtain the size of the new sngram if (self.min_size > 0) and (self.max_size > 0): #Case when the user specifies the max and min size of sn-grams if sngram[2] in xrange(self.min_size, self.max_size+1):#Case when the sn-grams from the list substitution have to be inserted self.DepNgrams.append(copy.deepcopy(sngram[0])) #Update the list of all sn-grams if sngram[2] < self.max_size: if sngram[0][0] == sentence.root_idx: if len(sngram[1]) > 0: combinations.append(copy.deepcopy(sngram)) else: if len(sngram[1]) > 0: combinations.append(copy.deepcopy(sngram)) candidates.append(copy.deepcopy(sngram)) else: #Case when there is no restriction on the size of the sn-grams self.DepNgrams.append(copy.deepcopy(sngram[0])) #Update the list of all sn-grams if sngram[0][0] == sentence.root_idx: if len(sngram[1]) > 0: combinations.append(copy.deepcopy(sngram)) else: if len(sngram[1]) > 0: combinations.append(copy.deepcopy(sngram)) candidates.append(copy.deepcopy(sngram)) ###################### def get_subtrees (self, sentence): # A function that gets all the possible subtrees in the tree unigrams = [] #List of all possible unigrams combinations = [] #List of all possible combinations of nodes and their children counter = 0 #Counts the number of children inserted in the aux list aux = [] #Auxiliar variable that contains the highest number of children allowed log = Set() #Variable that contains IDs of the nodes that have more children than it is allowed for node in self.subtrees: if self.max_num_children != 0: aux = [] #Reset the container for the next iteration counter = 0 #Reset the variable for the next iteration for child in sentence.children[node]: if self.min_size == 1 or self.min_size == 0 or self.max_size == 0: #This code obtains all unigrams of the sentence unigrams.append ([child]) aux.append(child) counter += 1 if counter > self.max_num_children: aux.pop() combinations.extend(self.get_next_combinations(node, aux, sentence))#We save new sn-grams in the global dictionary counter = 0 aux = [] aux.append(child) log.add(node) if len(aux) > 0: #Analyze the rest of the children combinations.extend(self.get_next_combinations(node, aux, sentence)) #We save new sn-grams in the global dictionary else: #In this case, there is no limitation on the number of children per node, so all the children are processed combinations.extend(self.get_next_combinations(node, sentence.children[node], sentence)) for child in sentence.children[node]: if self.min_size == 1 or self.min_size == 0 or self.max_size == 0:#This code obtains all unigrams of the sentence unigrams.append ([child]) return (unigrams, combinations, log) ###################### def get_next_combinations (self, value, children, sentence): ngram = [] #Auxiliary variable for storing the sn-gram options = [] #Auxiliary variable for storing the all the nodes that can be changed in a sn-gram combinations = [] #Auxiliary variable for generating a combination lista = [] #Auxiliary variable for all sn-grams during analysis of a sub-tree #Initialize the list of combinations for p in xrange(0, len(children)): combinations.append (0) #Generate sn-grams for r in xrange (1, len(children) + 1): for j in xrange (1, r + 1): combinations [j - 1] = j - 1 #################### The first combination options = [] ngram = [] ngram.append (value) ngram.append ("[") for z in xrange (0, r): ngram.append(children [combinations [z]]) if children[combinations[z]] not in sentence.leaves: options.append(children [combinations [z]]) ngram.append (",") ngram.pop (len(ngram) - 1) ngram.append ("]") lista.append (copy.deepcopy([ngram,options])) ################### The rest top = self.Combination (len(children), r) for j in xrange(2, top + 1): m = r val_max = len(children) while combinations [m - 1] + 1 == val_max: m -= 1 val_max -= 1 combinations [m - 1] += 1 for k in xrange (m + 1, r + 1): combinations [k - 1] = combinations [k - 2] + 1 options = [] ngram = [] ngram.append(value) ngram.append("[") for z in xrange(0, r): ngram.append (children [combinations [z]]) if children[combinations[z]] not in sentence.leaves: options.append(children [combinations [z]]) ngram.append (",") ngram.pop (len(ngram) - 1) ngram.append ("]") lista.append (copy.deepcopy([ngram,options])) return (lista) ###################### def Combination (self, sz, r): if sz == r: numerator = 1 else: numerator = sz for i in xrange (1, sz): numerator *= sz - i aux = r for i in xrange (1, r): aux *= r - i divisor = sz - r for i in xrange (1, sz - r): divisor *= sz - r - i numerator = numerator / (aux * divisor) return (numerator) ############ def process_one_sentence (result, sent_num, f2): print "Sentence " + str(sent_num) result.process_sentence (lines) #result.print_sngrams() result.write_all_sn_grams (f2) return sent_num + 1 ############### MAIN ################################ if __name__ == '__main__': if len(sys.argv) < 3: print "Usage with at least two parameters:" print "python SNGrams_selective.py input output" exit(1) if len(sys.argv) > 7 or (len(sys.argv) < 7 and len(sys.argv) > 3): print "Usage with at least six parameters:" print "python SNGrams_selective.py input output min_size max_size max_num_children option" exit(1) input_file = sys.argv[1] output_file = sys.argv[2] #############These are parameters of configuration for the class BiSNgrams min_size = 2 #These are the parameters of configuration for the class BiSNgrams max_size = 7 max_num_children = 5 option = 0 #Type of sn-grams to be obtained: 0 for WORD sn-grams; 1 for sn-grams of SR Tags; 2 for both if len(sys.argv) > 3: min_size = int(sys.argv[3]) #These are the parameters of configuration for the class BiSNgrams max_size = int(sys.argv[4]) max_num_children = int(sys.argv[5]) option = int(sys.argv[6]) #value 0: for sn-grams of words; #value 1: for sn-grams of sr tags; #value 2: for sn-grams of words and sr tags (equal to call with option 0 and then with option 1) encod = 'utf-8' #'utf-8' or other encoding like '1252' try: f1 = codecs.open (input_file, "rU", encoding = encod) except IOError as e: print "I/O error({0}): {1}".format(e.errno, e.strerror) exit(1) try: f2 = codecs.open (output_file, "wb", encoding = encod) #b - Binary, for Unix line endings except IOError as e: print "I/O error({0}): {1}".format(e.errno, e.strerror) exit(1) sent_num = 1; result = BiSNgrams(min_size, max_size, max_num_children, option) lines = [] while True : ln = f1.readline () if (not ln) or (ln == ""): break; ln = ln.strip() if ln == "": #Sentences are separated by EMPTY line if len (lines) > 0: sent_num = process_one_sentence (result, sent_num, f2) del lines [:] else: lines.append (ln) if len(lines) > 0: #Last piece in previous (while) sent_num = process_one_sentence (result, sent_num, f2) f1.close () f2.close () print "Done."