Mercurial > repos > abims-sbr > pogs
diff scripts/pogs.py @ 0:7bee3426a442 draft default tip
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author | abims-sbr |
---|---|
date | Fri, 01 Feb 2019 10:24:29 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/pogs.py Fri Feb 01 10:24:29 2019 -0500 @@ -0,0 +1,283 @@ +#!/usr/bin/env python +# coding: utf8 +# September 2017 - Author : Victor Mataigne (Station Biologique de Roscoff - ABiMS) +# Command line : ./pogsPOO.py <list_of_input_files_separated_by_commas> <minimal number of species per group> [-v) [-p] + +""" +What it does: + - pogs.py parses output files from the "pairwise" tool of the AdaptSearch suite and proceeds to gather genes in orthogroups (using transitivity). + - A minimal number of species per group has to be set. + +BETA VERSION +""" + +import os, argparse, itertools +import numpy as np +import pandas as pd + +""" Definition of a locus : header + sequence + a tag """ +class Locus: + + def __init__(self, header, sequence): + self.header = header + self.sequence = sequence + self.tagged = False + + def __str__(self): + return "{}{}".format(self.header, self.sequence) + + def __eq__(self, other): + return self.getHeader() == other.getHeader() # Test if two loci are the same + + def __hash__(self): + return hash((self.header, self.sequence)) # Make the object iterable and hashable + + def getHeader(self): + return self.header + + def getSequence(self): + return self.sequence + + def getTag(self): + return self.tagged + + def prettyPrint(self): + # Used for debugging : print "{ Header : ", self.header[0:-1], "Tag : ", self.tagged, " }" + print "[ Header : {header} ]".format(header=self.header[0:-1]) + + def prettyPrint2(self): + print "[ Header : {header} Sequence : {sequence} ]".format(header=self.header[0:-1], sequence=self.sequence[0:-1]) + +""" Applies the getPairwiseCouple() function to a list of files and return a big list with ALL pairwises couples + Returns a list of sets (2 items per set) """ +def getListPairwiseAll(listPairwiseFiles): + + # Sub-Function + + """ Reads an output file from the 'Pairwise' tool (AdaptSearch suite) and returns its content into a list + Returns a list of sets (2 items per set) """ + def getPairwiseCouple(pairwiseFile): + list_pairwises_2sp = [] + with open(pairwiseFile, "r") as file: + for name, sequence, name2, sequence2 in itertools.izip_longest(*[file]*4): + # One locus every two lines (one pairwise couple = 4 lines) : header + sequence + locus1 = Locus(name, sequence) + locus2 = Locus(name2, sequence2) + group = set([]) + group.add(locus1) + group.add(locus2) + list_pairwises_2sp.append(group) + return (list_pairwises_2sp) + + # Function + list_pairwises_allsp = [] + for file in listPairwiseFiles: + listPairwises = getPairwiseCouple(file) + for pairwise in listPairwises: + list_pairwises_allsp.append(pairwise) # all pairwises in the same 1D list + return list_pairwises_allsp + +""" Proceeds to create orthogroups by putting together pairwise couples sharing a locus. + Iterates over the orthogroups list and tag to 'True' the pairwise couple already gathered in a group to avoid + redondancy. Writes each orthogroup in a fasta file + Returns an integer (a list length) """ +def makeOrthogroups(list_pairwises_allsp, minspec, nb_rbh, verbose, paralogs): + + # Sub-funtions + + """ Check if a locus/group has already been treated in makeOrthogroups() + Returns a boolean """ + def checkIfTagged(pair): + tag = True + for element in pair: + if not element.getTag() and tag: # use a list comprehension maybe ? + tag = False + return tag + + """ True means a locus/group has already been treated in makeOrthogroups() + A stronger code would be to implement a method inside the class Locus """ + def tagGroup(pair): + for element in pair: + element.tagged = True + + """ Write an orthogroup in a file """ + def writeOutputFile(orthogroup, number, naming): + name = "" + if naming: + name = "orthogroup_{}_with_{}_sequences_withParalogs.fasta".format(number, len(orthogroup)) + else : + name = "orthogroup_{}_with_{}_sequences.fasta".format(number, len(orthogroup)) + result = open(name, "w") + with result: + for locus in orthogroup: + if locus.getHeader()[-1] == "\n": + result.write("%s" % locus.getHeader()) # write geneID + else : + result.write("%s\n" % locus.Header()) # write geneID + if locus.getSequence()[-1] == "\n": + result.write("%s" % locus.getSequence()) # write sequence + else : + result.write("%s\n" % locus.getSequence()) # write sequence + if naming: + os.system("mv {} outputs_withParalogs/".format(name)) + else : + os.system("mv {} outputs/".format(name)) + + """ Parse an orthogroup list to keep only one paralog sequence per species & per group + (Keeps the 1st paralogous encoutered) + Returns a list """ + def filterParalogs(list_orthogroups, minspec): + list_orthogroups_format = [] + j = 1 + + for nofilter_group in list_orthogroups: + new_group = [] + species = {} + for loci in nofilter_group: + species[loci.getHeader()[1:3]] = False + for loci in nofilter_group: + if not species[loci.getHeader()[1:3]]: + new_group.append(loci) + species[loci.getHeader()[1:3]] = True + + if len(new_group) >= minspec: # Drop too small orthogroups + list_orthogroups_format.append(new_group) + writeOutputFile(new_group, j, False) + j += 1 + + return list_orthogroups_format + + """ Builds a 2D array for a summary + Returns a numpy 2D array """ + def countings(listOrthogroups, nb_rbh): + + def compute_nbspec(nb_rbh): + + def factorielle(x): + n = 1 + s = 0 + while n <= x: + s += n + n += 1 + return s + + x = 2 + nb_specs = 0 + while x*x - factorielle(x) < nb_rbh: + x += 1 + return x + #listOrthogroups.sort().reverse() + #nblines = len(listOrthogroups[0]) + nblines = 0 + for group in listOrthogroups: + if len(group) > nblines: + nblines = len(group) + matrix = np.array([[0]*compute_nbspec(nb_rbh)]*nblines) + + for group in listOrthogroups: + listSpecs = [] + for loci in group: + if loci.getHeader()[1:3] not in listSpecs: + listSpecs.append(loci.getHeader()[1:3]) + matrix[len(group)-1][len(listSpecs)-1] += 1 + + return matrix + + """ numpy 2D array in a nice dataframe + Returns a pandas 2D dataframe """ + def asFrame(matrix) : + index = [0]*len(matrix) + colnames = [0]*len(matrix[0]) + index = [str(i+1)+" seqs" for i in range(len(matrix))] + colnames = [str(i+1)+" sps" for i in range(len(matrix[0]))] + df = pd.DataFrame(matrix, index=index, columns=colnames) + return df # Mettre une selection pour ne renvoyer que les lignes et les colonnes qui somment > 0 + #return df.loc['4 seqs':'9 seqs'].loc[:,colnames[3:]] + + # Function ------------------------------------------------------------------------------------------------- + list_orthogroups = [] + + for ortho_pair1 in list_pairwises_allsp[0:-1]: + if not checkIfTagged(ortho_pair1): + orthogroup = ortho_pair1 # the orthogroup grows as we go throught the second loop + + # check for common locus between two groups + for ortho_pair2 in list_pairwises_allsp[list_pairwises_allsp.index(ortho_pair1) + 1:]: + if len(orthogroup.intersection(ortho_pair2)) != 0 and not checkIfTagged(ortho_pair2): + orthogroup.update(orthogroup | ortho_pair2) + tagGroup(ortho_pair2) + + # Check if subgroup is already computed + if len(list_orthogroups) > 0: + presence = False + for group in list_orthogroups: + if len(group.intersection(orthogroup)) != 0: + group.update(group | orthogroup) + presence = True + if not presence: + list_orthogroups.append(orthogroup) + else: + list_orthogroups.append(orthogroup) + + # Options -------------------------------------------------------------------------------------------------- + + """ nb : I could try to implement a more complex code which does in the same previous loop all the following lines, to avoid multiples parsing of + the orthogroups list, but the code would become hardly readable. Since the whole program is already quite fast, I chosed code simplicity + over code efficiency """ + + # Print summary table with all paralogs + if verbose : + frame = countings(list_orthogroups, nb_rbh) + df = asFrame(frame) + print "\n Summary before paralogous filtering : \n" + print df.loc[df.ne(0).any(1),df.ne(0).any()], "\n" # Don't display columns and lines filled with 0 + + # Write outputFile with all the paralogous + if paralogs: + print "Writing orthogroups with paralogs files ...\n" + j = 1 + for group in list_orthogroups: + if len(group) >= minspec: + writeOutputFile(group, j, True) + j += 1 + + # Paralogs filtering and summary ---------------------------------------------------------------------------- + + print "Filtering paralogous sequences and writing final orthogroups files ..." + print " (Dropping Orthogroups with less than {} species)".format(minspec) + + # writeOutputFile() is called in filterParalogs() + list_orthogroups_format = filterParalogs(list_orthogroups, minspec) + + frame = countings(list_orthogroups_format, nb_rbh) + df = asFrame(frame) + print "\n Summary after paralogous filtering : \n" + print df.loc[df.ne(0).any(1),df.ne(0).any()] + + #return only the length of the list (at this point the program doesn't need more) + return len(list_orthogroups_format) + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("files", help="Input files separated by commas. Each file contains all the reciprocical best hits between a pair of species") + parser.add_argument("minspec", help="Only keep Orthogroups with at least this number of species", type=int) + parser.add_argument("-v", "--verbose", action="store_true", help="A supplemental summary table of orthogroups before paralogs filtering will be returned") + parser.add_argument("-p", "--paralogs", action="store_true", help="Proceeds to write orthogroups also before paralogous filtering") + args = parser.parse_args() + + print "*** pogs.py ***" + print "\nBuilding of orthogroups based on pairs of genes obtained by pairwise comparisons between pairs of species." + print "Genes are gathered in orthogroups based on the principle of transitivity between genes pairs." + + os.system("mkdir outputs") + if args.paralogs: os.system("mkdir outputs_withParalogs") + infiles = args.files + listPairwiseFiles = str.split(infiles, ",") + print "\nParsing input files ..." + list_Locus = getListPairwiseAll(listPairwiseFiles) + print "Creating Orthogroups ..." + nb_orthogroups = makeOrthogroups(list_Locus, args.minspec, len(listPairwiseFiles), args.verbose, args.paralogs) + print "\n{} orthogroups have been infered from {} pairwise comparisons by RBH\n".format(nb_orthogroups, len(listPairwiseFiles)) + +if __name__ == "__main__": + main()