diff scripts/pogs.py @ 0:7bee3426a442 draft default tip

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author abims-sbr
date Fri, 01 Feb 2019 10:24:29 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/pogs.py	Fri Feb 01 10:24:29 2019 -0500
@@ -0,0 +1,283 @@
+#!/usr/bin/env python
+# coding: utf8
+# September 2017 - Author : Victor Mataigne (Station Biologique de Roscoff - ABiMS)
+# Command line : ./pogsPOO.py <list_of_input_files_separated_by_commas> <minimal number of species per group> [-v) [-p]
+
+"""
+What it does:
+    - pogs.py parses output files from the "pairwise" tool of the AdaptSearch suite and proceeds to gather genes in orthogroups (using transitivity).
+    - A minimal number of species per group has to be set.
+
+BETA VERSION
+"""
+
+import os, argparse, itertools
+import numpy as np
+import pandas as pd
+
+""" Definition of a locus : header + sequence + a tag """
+class Locus:    
+
+    def __init__(self, header, sequence):
+        self.header = header
+        self.sequence = sequence
+        self.tagged = False
+
+    def __str__(self):
+        return "{}{}".format(self.header, self.sequence)
+
+    def __eq__(self, other):
+        return self.getHeader() == other.getHeader()  # Test if two loci are the same
+
+    def __hash__(self):
+        return hash((self.header, self.sequence))  # Make the object iterable and hashable
+
+    def getHeader(self):
+        return self.header
+
+    def getSequence(self):
+        return self.sequence
+
+    def getTag(self):
+        return self.tagged
+
+    def prettyPrint(self): 
+        # Used for debugging : print "{ Header : ", self.header[0:-1], "Tag : ", self.tagged, " }"
+        print "[ Header : {header} ]".format(header=self.header[0:-1])
+
+    def prettyPrint2(self):
+        print "[ Header : {header} Sequence : {sequence} ]".format(header=self.header[0:-1], sequence=self.sequence[0:-1])
+
+""" Applies the getPairwiseCouple() function to a list of files and return a big list with ALL pairwises couples 
+    Returns a list of sets (2 items per set) """
+def getListPairwiseAll(listPairwiseFiles):    
+    
+    # Sub-Function
+
+    """ Reads an output file from the 'Pairwise' tool (AdaptSearch suite) and returns its content into a list 
+        Returns a list of sets (2 items per set) """
+    def getPairwiseCouple(pairwiseFile):        
+        list_pairwises_2sp = []
+        with open(pairwiseFile, "r") as file:
+            for name, sequence, name2, sequence2 in itertools.izip_longest(*[file]*4):            
+                # One locus every two lines (one pairwise couple = 4 lines) : header + sequence
+                locus1 = Locus(name, sequence)
+                locus2 = Locus(name2, sequence2)
+                group = set([])
+                group.add(locus1)
+                group.add(locus2)
+                list_pairwises_2sp.append(group)
+        return (list_pairwises_2sp)
+
+    # Function
+    list_pairwises_allsp = []
+    for file in listPairwiseFiles:
+        listPairwises = getPairwiseCouple(file)
+        for pairwise in listPairwises:
+            list_pairwises_allsp.append(pairwise)  # all pairwises in the same 1D list
+    return list_pairwises_allsp
+
+""" Proceeds to create orthogroups by putting together pairwise couples sharing a locus.
+    Iterates over the orthogroups list and tag to 'True' the pairwise couple already gathered in a group to avoid
+    redondancy. Writes each orthogroup in a fasta file
+    Returns an integer (a list length) """
+def makeOrthogroups(list_pairwises_allsp, minspec, nb_rbh, verbose, paralogs):
+
+    # Sub-funtions
+
+    """ Check if a locus/group has already been treated in makeOrthogroups()
+        Returns a boolean """
+    def checkIfTagged(pair):
+        tag = True
+        for element in pair:
+            if not element.getTag() and tag: # use a list comprehension maybe ?
+                tag = False
+        return tag
+
+    """ True means a locus/group has already been treated in makeOrthogroups()
+        A stronger code would be to implement a method inside the class Locus """
+    def tagGroup(pair):
+        for element in pair:
+            element.tagged = True
+
+    """ Write an orthogroup in a file """
+    def writeOutputFile(orthogroup, number, naming):
+        name = ""
+        if naming:
+            name = "orthogroup_{}_with_{}_sequences_withParalogs.fasta".format(number, len(orthogroup))
+        else :
+            name = "orthogroup_{}_with_{}_sequences.fasta".format(number, len(orthogroup))
+        result = open(name, "w")
+        with result:
+            for locus in orthogroup:
+                if locus.getHeader()[-1] == "\n":
+                    result.write("%s" % locus.getHeader())  # write geneID
+                else :
+                    result.write("%s\n" % locus.Header())  # write geneID
+                if locus.getSequence()[-1] == "\n":
+                    result.write("%s" % locus.getSequence())  # write sequence
+                else :
+                    result.write("%s\n" % locus.getSequence())  # write sequence
+        if naming:
+            os.system("mv {} outputs_withParalogs/".format(name))
+        else :
+            os.system("mv {} outputs/".format(name))
+
+    """ Parse an orthogroup list to keep only one paralog sequence per species & per group
+        (Keeps the 1st paralogous encoutered)
+        Returns a list """
+    def filterParalogs(list_orthogroups, minspec):
+        list_orthogroups_format = []
+        j = 1
+
+        for nofilter_group in list_orthogroups:
+            new_group = []
+            species = {}
+            for loci in nofilter_group:
+                species[loci.getHeader()[1:3]] = False
+            for loci in nofilter_group:
+                if not species[loci.getHeader()[1:3]]:
+                    new_group.append(loci)
+                    species[loci.getHeader()[1:3]] = True
+
+            if len(new_group) >= minspec: # Drop too small orthogroups        
+                list_orthogroups_format.append(new_group)
+                writeOutputFile(new_group, j, False)
+                j += 1
+                
+        return list_orthogroups_format
+
+    """ Builds a 2D array for a summary
+        Returns a numpy 2D array """
+    def countings(listOrthogroups, nb_rbh):
+
+        def compute_nbspec(nb_rbh):
+    
+            def factorielle(x):
+                n = 1
+                s = 0
+                while n <= x:
+                    s += n
+                    n += 1
+                return s
+            
+            x = 2    
+            nb_specs = 0
+            while x*x - factorielle(x) < nb_rbh:
+                x += 1 
+            return x
+        #listOrthogroups.sort().reverse()
+        #nblines = len(listOrthogroups[0])
+        nblines = 0    
+        for group in listOrthogroups:
+            if len(group) > nblines:
+                nblines = len(group)
+        matrix = np.array([[0]*compute_nbspec(nb_rbh)]*nblines)        
+
+        for group in listOrthogroups:
+            listSpecs = []
+            for loci in group:
+                if loci.getHeader()[1:3] not in listSpecs:
+                    listSpecs.append(loci.getHeader()[1:3])
+            matrix[len(group)-1][len(listSpecs)-1] += 1
+
+        return matrix
+
+    """ numpy 2D array in a nice dataframe 
+        Returns a pandas 2D dataframe """
+    def asFrame(matrix) :
+        index = [0]*len(matrix)
+        colnames = [0]*len(matrix[0])
+        index = [str(i+1)+" seqs" for i in range(len(matrix))]
+        colnames = [str(i+1)+" sps" for i in range(len(matrix[0]))]
+        df = pd.DataFrame(matrix, index=index, columns=colnames)
+        return df # Mettre une selection pour ne renvoyer que les lignes et les colonnes qui somment > 0
+        #return df.loc['4 seqs':'9 seqs'].loc[:,colnames[3:]]
+
+    # Function -------------------------------------------------------------------------------------------------
+    list_orthogroups = []
+
+    for ortho_pair1 in list_pairwises_allsp[0:-1]:
+        if not checkIfTagged(ortho_pair1):
+            orthogroup = ortho_pair1 # the orthogroup grows as we go throught the second loop
+
+            # check for common locus between two groups
+            for ortho_pair2 in list_pairwises_allsp[list_pairwises_allsp.index(ortho_pair1) + 1:]:
+                if len(orthogroup.intersection(ortho_pair2)) != 0 and not checkIfTagged(ortho_pair2):
+                    orthogroup.update(orthogroup | ortho_pair2)
+                    tagGroup(ortho_pair2)
+
+            # Check if subgroup is already computed
+            if len(list_orthogroups) > 0:
+                presence = False
+                for group in list_orthogroups:
+                    if len(group.intersection(orthogroup)) != 0:
+                        group.update(group | orthogroup)
+                        presence = True
+                if not presence:
+                    list_orthogroups.append(orthogroup)
+            else:
+                list_orthogroups.append(orthogroup)
+    
+    # Options --------------------------------------------------------------------------------------------------
+
+    """ nb : I could try to implement a more complex code which does in the same previous loop all the following lines, to avoid multiples parsing of
+        the orthogroups list, but the code would become hardly readable. Since the whole program is already quite fast, I chosed code simplicity
+        over code efficiency """
+
+    # Print summary table with all paralogs
+    if verbose :
+        frame = countings(list_orthogroups, nb_rbh)
+        df = asFrame(frame)
+        print "\n    Summary before paralogous filtering : \n"
+        print df.loc[df.ne(0).any(1),df.ne(0).any()], "\n" # Don't display columns and lines filled with 0
+
+    # Write outputFile with all the paralogous
+    if paralogs:
+        print "Writing orthogroups with paralogs files ...\n"
+        j = 1
+        for group in list_orthogroups:
+            if len(group) >= minspec:
+                writeOutputFile(group, j, True)
+                j += 1    
+
+    # Paralogs filtering and summary ----------------------------------------------------------------------------
+
+    print "Filtering paralogous sequences and writing final orthogroups files ..."
+    print "    (Dropping Orthogroups with less than {} species)".format(minspec)
+
+    # writeOutputFile() is called in filterParalogs()    
+    list_orthogroups_format = filterParalogs(list_orthogroups, minspec)   
+
+    frame = countings(list_orthogroups_format, nb_rbh)    
+    df = asFrame(frame)
+    print "\n    Summary after paralogous filtering : \n"
+    print df.loc[df.ne(0).any(1),df.ne(0).any()]
+
+    #return only the length of the list (at this point the program doesn't need more)
+    return len(list_orthogroups_format)
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("files", help="Input files separated by commas. Each file contains all the reciprocical best hits between a pair of species")
+    parser.add_argument("minspec", help="Only keep Orthogroups with at least this number of species", type=int)    
+    parser.add_argument("-v", "--verbose", action="store_true", help="A supplemental summary table of orthogroups before paralogs filtering will be returned")
+    parser.add_argument("-p", "--paralogs", action="store_true", help="Proceeds to write orthogroups also before paralogous filtering")
+    args = parser.parse_args()
+
+    print "*** pogs.py ***"
+    print "\nBuilding of orthogroups based on pairs of genes obtained by pairwise comparisons between pairs of species."
+    print "Genes are gathered in orthogroups based on the principle of transitivity between genes pairs."    
+
+    os.system("mkdir outputs")
+    if args.paralogs: os.system("mkdir outputs_withParalogs")
+    infiles = args.files
+    listPairwiseFiles = str.split(infiles, ",")
+    print "\nParsing input files ..."
+    list_Locus = getListPairwiseAll(listPairwiseFiles)
+    print "Creating Orthogroups ..."
+    nb_orthogroups = makeOrthogroups(list_Locus, args.minspec, len(listPairwiseFiles), args.verbose, args.paralogs)
+    print "\n{} orthogroups have been infered from {} pairwise comparisons by RBH\n".format(nb_orthogroups, len(listPairwiseFiles))
+
+if __name__ == "__main__":
+    main()