'''
Version 4.1
Created on 11/12/2014, tested for Python 2.7
Update on 9/05/2016  (mixed sn-grams supported)
Update on 15/04/2020 (bug with dash in words fixed)
@authors: Juan Pablo Posadas, Grigori Sidorov
Class for obtaining syntactic n-grams from dependency trees using Stanford parser output
This version supports continuous and non-continuous sn-grams, as well as sn-grams without stop words or content sn-grams. It also supports mixed sn-grams (the first element of one type and the rest of the elements of a different type)
NOTE: Input tree should NOT be collapsed: -outputFormat "wordsAndTags, typedDependencies" -outputFormatOptions "basicDependencies"
NOTE: Since ",", "[", "]", and "\" are part of our metalanguage, we add a slash to them when they are part of the sentence (sn-grams), e.g., "\,", "\[", "\\"
This version allows to process separately nodes that have too many children (for a given threshold) and select min and max sizes of sn-grams. Default: 5, 2, 7
Ensure not to use input symbols that are not UTF-8.
'''
from sets import Set
import copy, sys,re
import codecs

class ReduceDepInfo(object):
    '''
    Reduced copy of a parsed sentence in which the stop words have been removed.

    The source object is never modified: its dictionaries are deep-copied and
    the copies are pruned. The children of a removed word are re-attached to
    the father of that word, and their relation label becomes
    "<relation of the removed word>;<their own relation>".
    The root word is never removed, even when it is a stop word.
    '''
    def __init__(self, obj, dictionary, has_POS):
        '''
        obj        -- parsed sentence exposing word, pos, dep, rel, children and root_idx
        dictionary -- container of lower-cased stop words (only tested with "in")
        has_POS    -- 1 when obj carries POS tags, any other value otherwise
        '''
        self.word          = copy.deepcopy(obj.word) #Dictionary of original words according to their positions
        self.pos           = copy.deepcopy(obj.pos) #Dictionary with POS tags of words
        self.dep           = copy.deepcopy(obj.dep) #Dictionary with the index of the father
        self.rel           = copy.deepcopy(obj.rel) #Dictionary with dependency relations
        self.children      = copy.deepcopy(obj.children) #Dictionary with words that are dependent for a (key) word
        self.leaves        = [] #List of indexes of words that are leaves
        self.root_idx      = obj.root_idx #Index of the root

        self.drop_nodes(dictionary, obj, has_POS)
    ########################################################################

    def drop_nodes(self, dictionary, obj, has_POS):
        '''
        Remove every non-root word whose lower-cased form is in dictionary and
        then rebuild the list of leaves.
        '''
        # Snapshot of the indexes (self.word shrinks while we iterate). Filtering
        # instead of list.remove() also copes with sentences that have no root.
        nodes = [idx for idx in obj.word if idx != obj.root_idx]
        for node in nodes:
            if self.word[node].lower() not in dictionary:
                continue  # Not a stop word: keep the node

            father = self.dep[node]
            relation = self.rel[node]
            if node in self.children:
                for child in self.children[node]:
                    self.children[father].append(child)  # The grandfather adopts the child
                    self.rel[child] = relation + ";" + self.rel[child]
                    self.dep[child] = father
                self.children.pop(node)  # The removed node no longer heads a subtree

            # Eliminate its occurrence in the tree
            self.word.pop(node)
            self.dep.pop(node)
            self.rel.pop(node)
            siblings = self.children[father]
            siblings.remove(node)
            if has_POS == 1:
                self.pos.pop(node)
            if not siblings:  # The father lost its last child, so it becomes a leaf
                self.children.pop(father)

        # A leaf is any remaining word that heads no subtree
        self.leaves.extend(idx for idx in self.word if idx not in self.children)
    ######################


class OnlyStopW(object):
    '''
    Reduced copy of a parsed sentence that keeps only the stop words.

    The source object is never modified: its dictionaries are deep-copied and
    the copies are pruned. The root is always kept, but its word and relation
    are replaced by the placeholder "null". The children of a removed word are
    re-attached to the father of that word, and their relation label becomes
    "<relation of the removed word>;<their own relation>".
    '''
    def __init__(self, obj, dictionary, has_POS):
        '''
        obj        -- parsed sentence exposing word, pos, dep, rel, children and root_idx
        dictionary -- container of lower-cased stop words (only tested with "in")
        has_POS    -- 1 when obj carries POS tags, any other value otherwise
        '''
        self.word          = copy.deepcopy(obj.word) #Dictionary of original words according to their positions
        self.pos           = copy.deepcopy(obj.pos) #Dictionary with POS tags of words
        self.dep           = copy.deepcopy(obj.dep) #Dictionary with the index of the father
        self.rel           = copy.deepcopy(obj.rel) #Dictionary with dependency relations
        self.children      = copy.deepcopy(obj.children) #Dictionary with words that are dependent for a (key) word
        self.leaves        = [] #List of indexes of words that are leaves
        self.root_idx      = obj.root_idx #Index of the root

        # Only neutralise a root that really exists; otherwise a phantom node
        # would be planted under the "no root" marker index.
        if self.root_idx in self.word:
            self.word[self.root_idx] = "null"
            self.rel[self.root_idx] = "null"
        self.drop_nodes(dictionary, obj, has_POS)
    ########################################################################

    def drop_nodes(self, dictionary, obj, has_POS):
        '''
        Remove every non-root word whose lower-cased form is NOT in dictionary
        and then rebuild the list of leaves.
        '''
        # Snapshot of the indexes (self.word shrinks while we iterate). Filtering
        # instead of list.remove() also copes with sentences that have no root.
        nodes = [idx for idx in obj.word if idx != obj.root_idx]
        for node in nodes:
            if self.word[node].lower() in dictionary:
                continue  # Stop word: keep the node

            father = self.dep[node]
            relation = self.rel[node]
            if node in self.children:
                for child in self.children[node]:
                    self.children[father].append(child)  # The grandfather adopts the child
                    self.rel[child] = relation + ";" + self.rel[child]
                    self.dep[child] = father
                self.children.pop(node)  # The removed node no longer heads a subtree

            # Eliminate its appearance in the tree
            self.word.pop(node)
            self.dep.pop(node)
            self.rel.pop(node)
            siblings = self.children[father]
            siblings.remove(node)
            if has_POS == 1:
                self.pos.pop(node)
            if not siblings:  # The father lost its last child, so it becomes a leaf
                self.children.pop(father)

        # A leaf is any remaining word that heads no subtree
        self.leaves.extend(idx for idx in self.word if idx not in self.children)
    ######################
            
                                                                          
class DepInfo(object):
    '''
    This class represents the dependency information of a sentence.

    It is built from typed-dependency lines of the form
    "relation(governor-gIdx, dependent-dIdx)", optionally preceded by one line
    of "word/TAG" tokens separated by single spaces.
    '''
    def __init__(self, lines, has_POS):
        '''
        lines   -- list of strings; when has_POS == 1 the first element is the
                   POS-tag line and it is REMOVED from the list (the caller's
                   list is modified in place)
        has_POS -- 1 when the POS-tag line is present, any other value otherwise
        '''
        self.word          = {} #Dictionary of original words according to their positions
        self.pos           = {} #Dictionary with POS tags of words
        self.rel           = {} #Dictionary with dependency relations
        self.dep           = {} #Dictionary with the index of the father
        self.children      = {} #Dictionary with words that are dependent for a (key) word
        self.leaves        = [] #List of indexes of words that are leaves
        self.root_idx      = -1 #Index of the root
        self.prepare_indices(lines, has_POS)

    ######################
    def prepare_indices(self, lines, has_POS):
        '''
        Fill word, rel, dep, children, leaves, root_idx and (optionally) pos
        from the parser output.
        '''
        if has_POS == 1:
            postags = lines.pop(0)  # Obtain the information about POS tags from the list lines

        for line in lines:
            # The dependent index follows the LAST dash, so words containing dashes are safe
            idx = int(line[line.rfind("-") + 1:].rstrip(")"))
            self.rel[idx] = line[:line.find("(")]
            line = line[line.find("("):]

            # The governor is the text between "(" and the first ", " found after a dash
            governor = line[line.find("(") + 1:line.find(", ", line.find("-"))]
            p_idx = int(governor[governor.rfind("-") + 1:])
            self.dep[idx] = p_idx

            # ",", "[", "]" and the backslash belong to the sn-gram metalanguage, so
            # they are escaped with ONE backslash. Commas are also escaped inside
            # longer tokens (e.g. thousands separators); doing it in the same branch
            # keeps a lone comma from being escaped twice.
            word = line[line.rfind(", ") + 2:line.rfind("-")]
            if word in ("[", "]", "\\"):
                word = "\\" + word
            else:
                word = word.replace(",", "\\,")
            self.word[idx] = word

            self.children.setdefault(p_idx, []).append(idx)

            if p_idx == 0:
                self.root_idx = idx
                self.rel[idx] = "root"  # Line added because of the FREELING output

        # Determine if a word is a leaf: it heads no subtree
        self.leaves.extend(i for i in self.word if i not in self.children)

        if has_POS == 1:
            # Token k of the POS line belongs to word index k (1-based). The tag is
            # whatever follows the LAST slash, because the word itself may contain
            # slashes.
            tokens = postags.split(" ")
            for idx in sorted(self.word):
                self.pos[idx] = tokens[idx - 1].rsplit("/", 1)[1]
######################


class BiSNgrams(object):
    '''
    classdocs
    '''
    def __init__(self, min_size, max_size, max_num_children, option):
        '''
        Constructor
        '''
        self.min_size         = min_size         #The minimum size for the sn-grams
        self.max_size         = max_size         #The maximum size for the sn-grams
        self.max_num_children = max_num_children #The maximum number of children per node
         
        if option in range(0,8):
            self.option         = option #Type of sn-grams to be obtained: 0 for WORD sn-grams; 1 for sn-grams of SR Tags ; 2 for both
        else: 
            print "Error: Invalid value for the parameter option"
            exit(1)
            
        self.subtrees       = [] #List that contains all the nodes that are not leaves
        self.DepNgrams      = []
        self.log            = [] #List that contains the nodes that have more children than the parameter max_num_children                
        self.dicPOSTags     = []
        self.dicSRTags      = []
        self.dicWordNgrams  = []
        self.dicWordSR      = []
        self.dicWordPOS     = []
        self.dicSRWord      = []
        self.dicSRPOS       = []
        self.dicPOSWord     = []
        self.dicPOSSR       = []
        
        for i in xrange(min_size, max_size+1):
            self.dicPOSTags.append({})
            self.dicSRTags.append({})
            self.dicWordNgrams.append({})
    
    def reset_vars (self):
        del self.subtrees[:]
        del self.DepNgrams[:] 
        del self.log[:]        
        del self.dicPOSTags[:]
        del self.dicSRTags[:]
        del self.dicWordNgrams[:] 
        del self.dicWordSR[:]
        del self.dicWordPOS[:]
        del self.dicSRWord[:]
        del self.dicSRPOS[:]
        del self.dicPOSWord[:]
        del self.dicPOSSR[:]       
        for i in xrange(self.min_size, self.max_size+1):
            self.dicPOSTags.append({})
            self.dicSRTags.append({})
            self.dicWordNgrams.append({}) 
            self.dicWordSR.append({})
            self.dicWordPOS.append({})
            self.dicSRWord.append({})
            self.dicSRPOS.append({})
            self.dicPOSWord.append({})
            self.dicPOSSR.append({})                          
    #########################################
    
    
    def print_parsed_sentence(self, sentence, has_POS):
        '''
        Print the sentence, then one tab-separated row per word
        (index, word, POS tag when has_POS == 1, relation, father index and,
        for non-leaf words, the list of children) and finally the leaf words.
        '''
        words = sentence.word

        # Words in positional order, each followed by a blank; trailing blanks trimmed
        headline = "*****Sentence: " + "".join(words[i] + " " for i in sorted(words.keys()))
        print(headline.rstrip(" "))

        for i in words.keys():
            fields = [str(i), words[i]]
            if has_POS == 1:
                fields.append(sentence.pos[i])
            fields.append(sentence.rel[i])
            fields.append(str(sentence.dep[i]))
            row = "\t".join(fields) + "\t"
            if i not in sentence.leaves:
                row += str(sentence.children[i])
            print(row)

        print("Leaf nodes are:")
        print("".join(words[i] + ", " for i in sentence.leaves))
    
    def write_all_sn_grams (self, f2,sent_num):
        '''
        This method write in a file one kind of sn-gram according with the value of op
        '''        
        if self.option == 0:#Acording with the params, the sngrams are stored in the container                                
            d = self.dicWordNgrams                                        
            f2.write("Sentence "+sent_num+" ************sn-grams of words:\n")        
        elif self.option == 1:                
            d = self.dicSRTags                
            f2.write("Sentence "+sent_num+" ************sn-grams of tags of syntactic relations (SR tags):\n")
        elif self.option == 2:
            d = self.dicPOSTags                 
            f2.write("Sentence "+sent_num+" ************sn-grams of POS tags:\n")
        elif self.option == 3:
            d = self.dicWordSR
            f2.write("Sentence "+sent_num+" ************sn-grams Word-SR:\n")                    
        elif self.option == 4:
            d = self.dicWordPOS
            f2.write("Sentence "+sent_num+" ************sn-grams Word-POS:\n")                    
        elif self.option == 5:
            d = self.dicSRWord                    
            f2.write("Sentence "+sent_num+" ************sn-grams SR-Word:\n")
        elif self.option == 6:
            d = self.dicSRPOS                    
            f2.write("Sentence "+sent_num+" ************sn-grams SR-POS:\n")
        elif self.option == 7:
            d = self.dicPOSWord                    
            f2.write("Sentence "+sent_num+" ************sn-grams POS-Word:\n")
        elif self.option == 8:
            d = self.dicPOSSR 
            f2.write("Sentence "+sent_num+" ************sn-grams POS-SR:\n")
        
        for idx, dic in enumerate(d):
            f2.write ("\n************Size: " + str(idx + self.min_size) + "\n")
            if len(dic.keys()) > 0:
                for item in dic.keys():
                    f2.write (item + "\t" + str(dic[item]) + "\n")
            else:
                f2.write("EMPTY\n")
        f2.write("\n")        
        #if self.option == 0 or self.option == 3:
        #    self.write_WordSngrams(f2,sent_num)
        #if self.option == 1 or self.option == 3:
        #    self.write_SRSngrams(f2,sent_num)
        #if self.option == 2 or self.option == 3:
        #    self.write_POSSngrams(f2,sent_num)
    ################################
    
    '''
    def write_WordSngrams(self,f2,sent_num):
        f2.write("Sentence "+sent_num+" ************sn-grams of words:\n")
        for idx, dic in enumerate(self.dicWordNgrams):
            f2.write ("\n************Size: " + str(idx + self.min_size) + "\n")
            if len(dic.keys()) > 0:
                for item in dic.keys():
                    f2.write (item + "\t" + str(dic[item]) + "\n")
            else:
                f2.write("EMPTY\n")
        f2.write("\n")
        
    def write_SRSngrams(self,f2,sent_num):
        f2.write("Sentence "+sent_num+" ************sn-grams of tags of syntactic relations (SR tags):\n")
        for idx, dic in enumerate(result.dicSRTags):
            f2.write ("\n************Size: " + str(idx + self.min_size) + "\n")
            if len(dic.keys()) > 0:
                for item in dic.keys():
                    f2.write (item + "\t" + str(dic[item]) + "\n")
            else:
                f2.write("EMPTY\n")
        f2.write("\n")
      
    def write_POSSngrams(self,f2,sent_num):
        f2.write("Sentence "+sent_num+" ************sn-grams of POS tags:\n")
        for idx, dic in enumerate(self.dicPOSTags):
            f2.write ("\n************Size: " + str(idx + self.min_size) + "\n")
            if len(dic.keys()) > 0:
                for item in dic.keys():
                    f2.write (item + "\t" + str(dic[item]) + "\n")
            else:
                f2.write("EMPTY\n")
        f2.write("\n")
    '''
        
    def print_sngrams(self):
        '''
        Print the sn-grams of the kind selected by self.option, grouped by size,
        each with its frequency.
        '''
        # option -> (name of the container attribute, header line)
        catalogue = {
            0: ("dicWordNgrams", "************sn-grams of words:"),
            1: ("dicSRTags", "************sn-grams of tags of syntactic relations (SR tags):"),
            2: ("dicPOSTags", "************sn-grams of POS tags:"),
            3: ("dicWordSR", "************sn-grams Word-SR:"),
            4: ("dicWordPOS", "************sn-grams Word-POS:"),
            5: ("dicSRWord", "************sn-grams SR-Word:"),
            6: ("dicSRPOS", "************sn-grams SR-POS:"),
            7: ("dicPOSWord", "************sn-grams POS-Word:"),
            8: ("dicPOSSR", "************sn-grams POS-SR:"),
        }
        # An option outside 0..8 leaves d unbound, exactly like the if/elif
        # chain this table replaces.
        if self.option in catalogue:
            container, header = catalogue[self.option]
            d = getattr(self, container)
            print(header)

        for position, bucket in enumerate(d):
            print("\n************Size: " + str(position + self.min_size))
            if bucket:
                for ngram, frequency in bucket.items():
                    print(ngram + "\t" + str(frequency))
            else:
                print("EMPTY")
            print("****************************************")
    ###########################

        
    def process_sentence(self, lines, option2, dictionary, has_POS):
        '''
        Run the whole pipeline for one sentence: parse it, optionally reduce the
        tree, generate the sn-grams and store them in the container selected by
        self.option.

        lines      -- parser output for the sentence (see DepInfo)
        option2    -- 1: drop the stop words, 2: keep only the stop words,
                      any other value: use the full tree
        dictionary -- container of lower-cased stop words
        has_POS    -- 1 when lines starts with the POS-tag line

        Invalid parameters are reported on standard output and nothing is
        stored. A minimum size larger than the sentence is skipped silently.
        '''
        self.reset_vars()
        sentence = DepInfo(lines, has_POS)
        self.print_parsed_sentence(sentence, has_POS)

        if option2 == 1:    # Content words only
            sentence = ReduceDepInfo(sentence, dictionary, has_POS)
            self.print_parsed_sentence(sentence, has_POS)
        elif option2 == 2:  # Stop words only
            sentence = OnlyStopW(sentence, dictionary, has_POS)
            self.print_parsed_sentence(sentence, has_POS)

        # Every word that has children is the root of a possible subtree
        self.subtrees.extend(i for i in sentence.word if i in sentence.children)

        if self.option not in range(0, 9):
            print("\tERROR: Invalid value for the parameter option\n")
            return
        if self.min_size < 0:
            print("\tERROR: The value of the minimum size is not allowed\n")
            return
        if self.max_size < self.min_size:
            print("\tERROR: The maximum size must be greater than the minimum size\n")
            return
        if self.min_size > len(sentence.word):
            return  # The sentence is too short for the requested sizes

        log = self.get_all_DepNgrams(sentence)
        if len(log) > 0:
            print("\tThe next words have more than " + str(self.max_num_children) + " children:\n")
            for item in log:
                print("\t\t" + sentence.word[item] + "\n")

        self.store_all_DepNgrams(sentence, self.option)
        

    def prepare_SNgram(self, line, sentence, op):
        '''        
        op = -1   for sngrams of index
        op = 0    for sngrams of words
        op = 1    for sngrams of sr tags
        op = 2    for sngrams of POS tags        
        op = 3    for Word/SR SNgrams
        op = 4    for Word/POS SNgrams
        op = 5    for SR/Word SNgrams
        op = 6    for SR/POS SNgrams
        op = 7    for POS/Word SNgrams
        op = 8    for POS/SR SNgrams
        '''
        ngram = ""
        for item in line:
            if type(item) is str:
                ngram += item
            elif type(item) is int:
                if op == -1:
                    ngram += str(item)
                elif op == 0:
                    ngram += sentence.word[item]
                elif op == 1:
                    ngram += sentence.rel[item]
                elif op == 2:
                    ngram += sentence.pos[item]                 
                elif op == 3:
                    ngram += sentence.word[item]
                    op = 1
                elif op == 4:
                    ngram += sentence.word[item]
                    op = 2
                elif op == 5:
                    ngram += sentence.rel[item]
                    op = 0
                elif op == 6:
                    ngram += sentence.rel[item]
                    op = 2
                elif op == 7:
                    ngram += sentence.pos[item]
                    op = 0
                elif op == 8:
                    ngram += sentence.pos[item]
                    op = 1                               
            else:
                ngram += self.prepare_SNgram(item, sentence, op)
        return ngram
    ######################
    
                   
    def is_continuous(self, ngram):
        '''
        Return "YES" when the sn-gram contains no comma and "NO" otherwise.
        Every comma counts, so the sn-gram is assumed to contain no punctuation
        words. Used for testing.
        '''
        return "NO" if "," in ngram else "YES"
    ######################
    
                  
    def len_Ngram(self, ngram):
        '''
        Return the number of elements of a rendered sn-gram: one for the head
        plus one for every "[" or "," that is not preceded by a backslash
        (escaped brackets and commas are words, not structure).
        '''
        structural = ngram.count("[") + ngram.count(",")
        escaped = ngram.count("\\[") + ngram.count("\\,")
        return 1 + structural - escaped
    ########################
    
    
    def get_all_DepNgrams(self, sentence):
        '''
        This method begins the process of getting all the sn-grams of the dependency tree
        '''                         
        unigrams      = []  #Auxiliar variable that contains all the unigrams
        combinations  = []  #Auxiliar variable that contains all the combinations of a node with its children
        aux           = []
        log           = Set()
        
    
        if sentence.root_idx > 0:
            unigrams, combinations, log = self.get_subtrees (sentence)#Call this method first for obtaining all the posible subtrees                    
            
            if len(unigrams) > 0:
                self.DepNgrams.append([sentence.root_idx])
                self.DepNgrams.extend(unigrams)            #Adds the unigrams to the general container
            for item in combinations:                      #Adds the first sn-grams to the general container
                if self.min_size != 0 or self.max_size != 0:                        
                    size = self.len_Ngram(self.prepare_SNgram(item[0], sentence, -1))
                    if size >= self.min_size and size <= self.max_size:        #Check the size of the new sn-grams                
                        self.DepNgrams.append(copy.deepcopy(item[0]))                        
                    if size < self.max_size:
                        aux.append(item)
                else:
                    self.DepNgrams.append(copy.deepcopy(item[0]))

            if self.min_size != 0 or self.max_size != 0:
                self.compound_sngrams(aux, sentence)   #This function generates the rest of sn-grams
            else:
                self.compound_sngrams(combinations, sentence)
        else:
            line = "\tError, no root found\n"
            print line        
        return(log)    
    ######################        
    
    
    def store_all_DepNgrams(self, sentence, op):
        '''
        This method stores the sn-grams in the container specified by the parameter "op"
        '''         
        if op == 0:#Acording with the params, the sngrams are stored in the container                                
            d = self.dicWordNgrams                                                
        elif op == 1:                
            d = self.dicSRTags                
        elif op == 2:
            d = self.dicPOSTags                 
        elif op == 3:
            d = self.dicWordSR                    
        elif op == 4:
            d = self.dicWordPOS                    
        elif op == 5:
            d = self.dicSRWord                    
        elif op == 6:
            d = self.dicSRPOS                    
        elif op == 7:
            d = self.dicPOSWord                    
        elif op == 8:
            d = self.dicPOSSR
        
        for item in self.DepNgrams:
            ngram = self.prepare_SNgram (item, sentence, op)
            size = self.len_Ngram(ngram)               
            dic = d[size-self.min_size]                                                                                
            #Update the dictionary of SNgrams contained in the sample (frequency in the text)                         
            if dic.has_key(ngram) > 0:#Update the frequency of the ngram                
                dic[ngram] += 1 #If the sn-gram exists in the dictionary, update its frequency                 
            else:
                dic[ngram] = 1 #Otherwise, add the sn-gram to the dictionary                            
    ######################
    
        
    def compound_sngrams(self, original, sentence):
        combinations   = []
        candidates    = []    
        
        for combination in original:                     #This cycle initializes the list of combinations and list of candidates 
            if len(combination[1]) > 0:               
                size = self.len_Ngram(self.prepare_SNgram(combination[0], sentence, -1))
                combinations.append([combination[0],combination[1],size])
            if combination[0][0] != sentence.root_idx:
                size = self.len_Ngram(self.prepare_SNgram(combination[0], sentence, -1))       
                candidates.append([combination[0],combination[1],size])
                                      
        while len(candidates) > 0:                        #In this cycle, select a sn-gram to be replaced in the rest of combinations            
            candidate = candidates.pop(0)
            value = candidate[0][0]                       #Get the first number of the first candidate sn-gram
                              
            for combination in combinations:
                if value in combination[1]:                
                                        
                    position = combination[0].index(value,2)#First get the position of the element          
          
                    sngram = copy.deepcopy(combination)
                    sngram[0].pop(position)                 #Delete the element in the sn-gram
                    sngram[0].insert(position,candidate[0]) #Insert the new part into the sn-gram
                    sngram[1].remove(value)                 #Update its list of posible combinations
                    sngram[2] = self.len_Ngram(self.prepare_SNgram(sngram[0], sentence, -1))#Obtain the size of the new sngram                                        

                    if (self.min_size > 0) and (self.max_size > 0):            #Case when the user specifies the max and min size of sn-grams                        
                        if sngram[2] in xrange(self.min_size, self.max_size+1):#Case when the sn-grams from the list substitution have to be inserted                            
                            self.DepNgrams.append(copy.deepcopy(sngram[0]))    #Update the list of all sn-grams
                        if sngram[2] < self.max_size:                                                    
                            if sngram[0][0] == sentence.root_idx:
                                if len(sngram[1]) > 0:
                                    combinations.append(copy.deepcopy(sngram))
                            else:
                                if len(sngram[1]) > 0:
                                    combinations.append(copy.deepcopy(sngram))
                                candidates.append(copy.deepcopy(sngram))
                    else:                                                 #Case when there is no restriction on the size of the sn-grams
                        self.DepNgrams.append(copy.deepcopy(sngram[0]))   #Update the list of all sn-grams                                                    
                        if sngram[0][0] == sentence.root_idx:
                            if len(sngram[1]) > 0:
                                combinations.append(copy.deepcopy(sngram))
                        else:
                            if len(sngram[1]) > 0:
                                combinations.append(copy.deepcopy(sngram))
                            candidates.append(copy.deepcopy(sngram))
                                               
    
    ######################              
    def get_subtrees (self, sentence):   # A function that gets all the possible subtrees in the tree
        '''
        Collect the unigrams and the head/children combinations for every node in self.subtrees.

        sentence: object exposing "children" (indexable by node, giving the list of its
                  children); it is also forwarded unchanged to get_next_combinations.

        When self.max_num_children is 0 all the children of a node are combined at once.
        Otherwise the children are combined in consecutive groups, and every node whose
        children had to be split into more than one group is recorded in the returned log.

        Returns the tuple (unigrams, combinations, log):
          unigrams     - one single-element list [child] per child, filled only when
                         min_size is 0 or 1 or max_size is 0
          combinations - concatenation of the lists returned by get_next_combinations
          log          - Set with the nodes whose children were split
        '''
        unigrams      = []    #List of all possible unigrams
        combinations  = []    #List of all possible combinations of nodes and their children
        log           = Set() #IDs of the nodes that have more children than it is allowed
        limit         = self.max_num_children
        #The condition does not depend on the node, so it is evaluated only once
        want_unigrams = self.min_size == 1 or self.min_size == 0 or self.max_size == 0

        for node in self.subtrees:
            children = sentence.children[node]

            if want_unigrams:
                unigrams.extend([child] for child in children)

            if limit == 0:   #No limitation on the number of children per node
                combinations.extend(self.get_next_combinations(node, children, sentence))
                continue

            group = []       #Children waiting to be combined together
            for child in children:
                #The size of the group itself is checked (instead of a separate counter),
                #so every group, not only the first one, holds at most "limit" children
                if len(group) >= limit:
                    combinations.extend(self.get_next_combinations(node, group, sentence))
                    log.add(node)
                    group = []
                group.append(child)

            if len(group) > 0:   #Analyze the rest of the children
                combinations.extend(self.get_next_combinations(node, group, sentence))

        return (unigrams, combinations, log)
    
    
    ######################                  
    def get_next_combinations (self, value, children, sentence):
        '''
        Build one sn-gram skeleton for every non-empty subset of children.

        value:    the head node; it is placed first in every sn-gram
        children: sequence with the children of the head
        sentence: object exposing "leaves"; children found in it are not expandable

        Subsets are generated by increasing size (1 .. len(children)) and, inside each size,
        in lexicographic order of the positions of the children.

        Returns a list of pairs [ngram, options]:
          ngram   - [value, "[", c1, ",", c2, ..., "]"]
          options - the children of this subset that are NOT in sentence.leaves
        Every ngram and options list is a new object, so callers may modify them freely.
        An empty children sequence yields an empty list.
        '''
        import itertools   #Local import: itertools is not imported at the top of the file

        #The leaf test is done once per child instead of once per generated combination
        tagged = [(child, child not in sentence.leaves) for child in children]

        lista = []   #All sn-grams obtained for this sub-tree
        for r in range(1, len(tagged) + 1):
            #itertools.combinations emits the subsets in lexicographic order of positions
            for subset in itertools.combinations(tagged, r):
                ngram   = [value, "["]
                options = []
                for child, expandable in subset:
                    ngram.append(child)
                    ngram.append(",")
                    if expandable:
                        options.append(child)
                ngram[-1] = "]"   #The trailing comma becomes the closing bracket
                lista.append([ngram, options])

        return (lista)
    
    ######################                  
    def Combination (self, sz, r):
        '''
        Return the number of ways of choosing r elements out of sz (binomial coefficient).

        sz: size of the set
        r:  size of the selection

        Returns an integer; 0 when r is negative or greater than sz, and 1 when r is 0.
        '''
        if r < 0 or r > sz:
            return 0

        r = min(r, sz - r)   #C(sz, r) == C(sz, sz - r); the smaller value needs fewer steps

        numerator = 1
        for i in range(1, r + 1):
            #After step i the value is C(sz - r + i, i), so the floor division is exact
            numerator = numerator * (sz - r + i) // i

        return (numerator)

############
def process_one_sentence (lines, result, sent_num, f2, option2, dictionary, has_POS):
    '''
    Process the lines of one sentence and write its sn-grams.

    The sentence number is announced on stdout, result.process_sentence is called with
    the lines, option2, dictionary and has_POS, and result.write_all_sn_grams is called
    with f2 and the sentence number converted to a string.

    Returns the number to be used for the next sentence (sent_num + 1).
    '''
    label = str(sent_num)
    print("Sentence " + label)

    result.process_sentence(lines, option2, dictionary, has_POS)
    result.write_all_sn_grams(f2, label)

    next_num = sent_num + 1
    return next_num

############### MAIN ################################
def _load_stop_words (path, encoding):
    '''
    Read the stop words dictionary stored in path (one entry per line).

    The path is echoed on stdout and trailing whitespace is removed from every entry.
    Returns the list of entries. Prints an error message and ends the program with
    status 1 when the file cannot be read or when it has no lines.
    '''
    print(path)
    words = []
    try:
        f3 = codecs.open (path, "rU", encoding = encoding)
        try:
            for item in f3.readlines():
                words.append(item.rstrip())
        finally:
            f3.close()   #The file is closed even if reading fails
    except IOError as e:
        print(path + "I/O error({0}): {1}".format(e.errno, e.strerror))
        sys.exit(1)

    if len(words) == 0:
        print("ERROR: Empty dictionary")
        sys.exit(1)

    return words


if __name__ == '__main__':

    encod = 'utf-8'   #'utf-8' or other encoding like '1252'
    dictionary = []   #variable that contains the stop words

    #Cases:
    #python MultiSNGrams_3.py input output
    #    Note: Default values are min_size = 2; max_size = 7; max_num_children = 5, option = 0 (word sn-grams), option2 = -1 (not prune)
    #python MultiSNGrams_3.py input output dictionary
    #    Note: Same default values, except option2 = 1 because a dictionary is given
    #python MultiSNGrams_3.py input output min_size max_size max_num_children option
    #python MultiSNGrams_3.py input output min_size max_size max_num_children option option2 dictionary

    argc = len(sys.argv)
    if argc < 3:
        print("Usage with at least two parameters:")
        print("python SNGrams3.py input output")
        sys.exit(1)
    elif argc > 9:
        print("Usage with at most eight parameters:")
        print("python SNGrams3.py input output dictionary min_size max_size max_num_children option")
        sys.exit(1)
    elif argc not in [3,4,7,9]:
        print("Mising parameters")
        sys.exit(1)

    input_file      = sys.argv[1]
    output_file     = sys.argv[2]

    #############These are parameters of configuration for the class BiSNgrams
    min_size         = 2
    max_size         = 7
    max_num_children = 5
    option           = 0    #Type of sn-grams to be obtained:
                            #option = 0    for sngrams of words
                            #option = 1    for sngrams of SR tags
                            #option = 2    for sngrams of POS tags
                            #option = 3    for Word/SR SNgrams
                            #option = 4    for Word/POS SNgrams
                            #option = 5    for SR/Word SNgrams
                            #option = 6    for SR/POS SNgrams
                            #option = 7    for POS/Word SNgrams
                            #option = 8    for POS/SR SNgrams
    valid_options    = []   #List of valid options according with the format of input text
    has_POS          = 1    #Value of 1 indicates the input contains POS tags otherwise value of 0. Default value is 1
    option2          = -1   #Valid only for prune trees: 1 for no stopwords; 2 for only stopwords
    dictionary_file  = None #Path of the stop words dictionary (only given with 4 or 9 arguments)

    if argc in (7, 9):      #Explicit configuration of the sn-grams
        min_size         = int(sys.argv[3])
        max_size         = int(sys.argv[4])
        max_num_children = int(sys.argv[5])
        option           = int(sys.argv[6])    #See the list of values above

    if argc == 9:
        option2          = int(sys.argv[7])    #value for the kind of prune of the tree
        dictionary_file  = str(sys.argv[8])    #path of the stopwords dictionary
    elif argc == 4:
        dictionary_file  = sys.argv[3]
        option2          = 1

    if dictionary_file is not None:
        dictionary = _load_stop_words (dictionary_file, encod)

    #**Read the first line of the input file and identify the format (includes POS tags or not)
    try:
        f1 = codecs.open (input_file,  "rU", encoding = encod)
        try:
            first_ln = f1.readline()   #Empty string for an empty file (no IndexError)
        finally:
            f1.close()
    except IOError as e:
        print(input_file + "I/O error({0}): {1}".format(e.errno, e.strerror))
        sys.exit(1)

    m = re.search(r'-[0-9]*\)', first_ln)
    if m:#Case where there are NOT POS tags
        valid_options    = [0,1,3,5]
        has_POS = 0
        if option not in valid_options:
            print("ERROR: The selected option requieres POS tags but the input file does not contain POS tags")
            print("Select one of the following options")
            print("option = 0    for sngrams of words")
            print("option = 1    for sngrams of SR tags")
            print("option = 3    for Word/SR SNgrams")
            print("option = 5    for SR/Word SNgrams")
            sys.exit(1)

    #The input is opened again so that the processing starts at the first line
    try:
        f1 = codecs.open (input_file,  "rU", encoding = encod)
    except IOError as e:
        print(input_file + "I/O error({0}): {1}".format(e.errno, e.strerror))
        sys.exit(1)

    try:
        f2 = codecs.open (output_file, "wb", encoding = encod)
    except IOError as e:
        f1.close()
        print(output_file + "I/O error({0}): {1}".format(e.errno, e.strerror))
        sys.exit(1)

    try:
        sent_num = 1
        result = BiSNgrams(min_size, max_size, max_num_children, option)
        lines  = []

        ###########Process the input file
        if has_POS == 1: #Case where the POS tags are included in the input file
            print("Case with POS tags")
            flag = 0 #Counts the empty lines; the second one closes a sentence
            for ln in f1:
                ln = ln.strip()
                if ln == "":
                    flag += 1
                else:
                    lines.append(ln)
                if flag == 2:
                    if len (lines) > 0:
                        sent_num = process_one_sentence (lines, result, sent_num, f2, option2, dictionary, has_POS)
                        del lines [:]
                    #Reset also when nothing was collected; otherwise the counter stays at 2
                    #and the next non-empty line would be processed alone as a sentence
                    flag = 0
        else: #Case where there are NO POS tags in the input file
            print("Case with no POS tags detected")
            for ln in f1:
                ln = ln.strip()
                if ln == "":   #Sentences are separated by EMPTY line
                    if len (lines) > min_size:
                        sent_num = process_one_sentence (lines, result, sent_num, f2, option2, dictionary, has_POS)
                    #Always discard the collected lines, so that a sentence that was too
                    #short to be processed is not merged with the following one
                    del lines [:]
                else:
                    lines.append (ln)

        if len(lines) > 0: #Last piece that was not followed by an empty line
            sent_num = process_one_sentence (lines, result, sent_num, f2, option2, dictionary, has_POS)
    finally:
        #Both files are closed even if the processing of a sentence fails
        f1.close ()
        f2.close ()

    print("Done.")