Mercurial > repos > abims-sbr > pogs
view scripts/pogs.py @ 0:7bee3426a442 draft default tip
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author | abims-sbr |
---|---|
date | Fri, 01 Feb 2019 10:24:29 -0500 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python # coding: utf8 # September 2017 - Author : Victor Mataigne (Station Biologique de Roscoff - ABiMS) # Command line : ./pogsPOO.py <list_of_input_files_separated_by_commas> <minimal number of species per group> [-v) [-p] """ What it does: - pogs.py parses output files from the "pairwise" tool of the AdaptSearch suite and proceeds to gather genes in orthogroups (using transitivity). - A minimal number of species per group has to be set. BETA VERSION """ import os, argparse, itertools import numpy as np import pandas as pd """ Definition of a locus : header + sequence + a tag """ class Locus: def __init__(self, header, sequence): self.header = header self.sequence = sequence self.tagged = False def __str__(self): return "{}{}".format(self.header, self.sequence) def __eq__(self, other): return self.getHeader() == other.getHeader() # Test if two loci are the same def __hash__(self): return hash((self.header, self.sequence)) # Make the object iterable and hashable def getHeader(self): return self.header def getSequence(self): return self.sequence def getTag(self): return self.tagged def prettyPrint(self): # Used for debugging : print "{ Header : ", self.header[0:-1], "Tag : ", self.tagged, " }" print "[ Header : {header} ]".format(header=self.header[0:-1]) def prettyPrint2(self): print "[ Header : {header} Sequence : {sequence} ]".format(header=self.header[0:-1], sequence=self.sequence[0:-1]) """ Applies the getPairwiseCouple() function to a list of files and return a big list with ALL pairwises couples Returns a list of sets (2 items per set) """ def getListPairwiseAll(listPairwiseFiles): # Sub-Function """ Reads an output file from the 'Pairwise' tool (AdaptSearch suite) and returns its content into a list Returns a list of sets (2 items per set) """ def getPairwiseCouple(pairwiseFile): list_pairwises_2sp = [] with open(pairwiseFile, "r") as file: for name, sequence, name2, sequence2 in itertools.izip_longest(*[file]*4): # One locus every two lines (one pairwise couple = 4 lines) : header + sequence locus1 = Locus(name, sequence) locus2 = Locus(name2, sequence2) group = set([]) group.add(locus1) group.add(locus2) list_pairwises_2sp.append(group) return (list_pairwises_2sp) # Function list_pairwises_allsp = [] for file in listPairwiseFiles: listPairwises = getPairwiseCouple(file) for pairwise in listPairwises: list_pairwises_allsp.append(pairwise) # all pairwises in the same 1D list return list_pairwises_allsp """ Proceeds to create orthogroups by putting together pairwise couples sharing a locus. Iterates over the orthogroups list and tag to 'True' the pairwise couple already gathered in a group to avoid redondancy. Writes each orthogroup in a fasta file Returns an integer (a list length) """ def makeOrthogroups(list_pairwises_allsp, minspec, nb_rbh, verbose, paralogs): # Sub-funtions """ Check if a locus/group has already been treated in makeOrthogroups() Returns a boolean """ def checkIfTagged(pair): tag = True for element in pair: if not element.getTag() and tag: # use a list comprehension maybe ? tag = False return tag """ True means a locus/group has already been treated in makeOrthogroups() A stronger code would be to implement a method inside the class Locus """ def tagGroup(pair): for element in pair: element.tagged = True """ Write an orthogroup in a file """ def writeOutputFile(orthogroup, number, naming): name = "" if naming: name = "orthogroup_{}_with_{}_sequences_withParalogs.fasta".format(number, len(orthogroup)) else : name = "orthogroup_{}_with_{}_sequences.fasta".format(number, len(orthogroup)) result = open(name, "w") with result: for locus in orthogroup: if locus.getHeader()[-1] == "\n": result.write("%s" % locus.getHeader()) # write geneID else : result.write("%s\n" % locus.Header()) # write geneID if locus.getSequence()[-1] == "\n": result.write("%s" % locus.getSequence()) # write sequence else : result.write("%s\n" % locus.getSequence()) # write sequence if naming: os.system("mv {} outputs_withParalogs/".format(name)) else : os.system("mv {} outputs/".format(name)) """ Parse an orthogroup list to keep only one paralog sequence per species & per group (Keeps the 1st paralogous encoutered) Returns a list """ def filterParalogs(list_orthogroups, minspec): list_orthogroups_format = [] j = 1 for nofilter_group in list_orthogroups: new_group = [] species = {} for loci in nofilter_group: species[loci.getHeader()[1:3]] = False for loci in nofilter_group: if not species[loci.getHeader()[1:3]]: new_group.append(loci) species[loci.getHeader()[1:3]] = True if len(new_group) >= minspec: # Drop too small orthogroups list_orthogroups_format.append(new_group) writeOutputFile(new_group, j, False) j += 1 return list_orthogroups_format """ Builds a 2D array for a summary Returns a numpy 2D array """ def countings(listOrthogroups, nb_rbh): def compute_nbspec(nb_rbh): def factorielle(x): n = 1 s = 0 while n <= x: s += n n += 1 return s x = 2 nb_specs = 0 while x*x - factorielle(x) < nb_rbh: x += 1 return x #listOrthogroups.sort().reverse() #nblines = len(listOrthogroups[0]) nblines = 0 for group in listOrthogroups: if len(group) > nblines: nblines = len(group) matrix = np.array([[0]*compute_nbspec(nb_rbh)]*nblines) for group in listOrthogroups: listSpecs = [] for loci in group: if loci.getHeader()[1:3] not in listSpecs: listSpecs.append(loci.getHeader()[1:3]) matrix[len(group)-1][len(listSpecs)-1] += 1 return matrix """ numpy 2D array in a nice dataframe Returns a pandas 2D dataframe """ def asFrame(matrix) : index = [0]*len(matrix) colnames = [0]*len(matrix[0]) index = [str(i+1)+" seqs" for i in range(len(matrix))] colnames = [str(i+1)+" sps" for i in range(len(matrix[0]))] df = pd.DataFrame(matrix, index=index, columns=colnames) return df # Mettre une selection pour ne renvoyer que les lignes et les colonnes qui somment > 0 #return df.loc['4 seqs':'9 seqs'].loc[:,colnames[3:]] # Function ------------------------------------------------------------------------------------------------- list_orthogroups = [] for ortho_pair1 in list_pairwises_allsp[0:-1]: if not checkIfTagged(ortho_pair1): orthogroup = ortho_pair1 # the orthogroup grows as we go throught the second loop # check for common locus between two groups for ortho_pair2 in list_pairwises_allsp[list_pairwises_allsp.index(ortho_pair1) + 1:]: if len(orthogroup.intersection(ortho_pair2)) != 0 and not checkIfTagged(ortho_pair2): orthogroup.update(orthogroup | ortho_pair2) tagGroup(ortho_pair2) # Check if subgroup is already computed if len(list_orthogroups) > 0: presence = False for group in list_orthogroups: if len(group.intersection(orthogroup)) != 0: group.update(group | orthogroup) presence = True if not presence: list_orthogroups.append(orthogroup) else: list_orthogroups.append(orthogroup) # Options -------------------------------------------------------------------------------------------------- """ nb : I could try to implement a more complex code which does in the same previous loop all the following lines, to avoid multiples parsing of the orthogroups list, but the code would become hardly readable. Since the whole program is already quite fast, I chosed code simplicity over code efficiency """ # Print summary table with all paralogs if verbose : frame = countings(list_orthogroups, nb_rbh) df = asFrame(frame) print "\n Summary before paralogous filtering : \n" print df.loc[df.ne(0).any(1),df.ne(0).any()], "\n" # Don't display columns and lines filled with 0 # Write outputFile with all the paralogous if paralogs: print "Writing orthogroups with paralogs files ...\n" j = 1 for group in list_orthogroups: if len(group) >= minspec: writeOutputFile(group, j, True) j += 1 # Paralogs filtering and summary ---------------------------------------------------------------------------- print "Filtering paralogous sequences and writing final orthogroups files ..." print " (Dropping Orthogroups with less than {} species)".format(minspec) # writeOutputFile() is called in filterParalogs() list_orthogroups_format = filterParalogs(list_orthogroups, minspec) frame = countings(list_orthogroups_format, nb_rbh) df = asFrame(frame) print "\n Summary after paralogous filtering : \n" print df.loc[df.ne(0).any(1),df.ne(0).any()] #return only the length of the list (at this point the program doesn't need more) return len(list_orthogroups_format) def main(): parser = argparse.ArgumentParser() parser.add_argument("files", help="Input files separated by commas. Each file contains all the reciprocical best hits between a pair of species") parser.add_argument("minspec", help="Only keep Orthogroups with at least this number of species", type=int) parser.add_argument("-v", "--verbose", action="store_true", help="A supplemental summary table of orthogroups before paralogs filtering will be returned") parser.add_argument("-p", "--paralogs", action="store_true", help="Proceeds to write orthogroups also before paralogous filtering") args = parser.parse_args() print "*** pogs.py ***" print "\nBuilding of orthogroups based on pairs of genes obtained by pairwise comparisons between pairs of species." print "Genes are gathered in orthogroups based on the principle of transitivity between genes pairs." os.system("mkdir outputs") if args.paralogs: os.system("mkdir outputs_withParalogs") infiles = args.files listPairwiseFiles = str.split(infiles, ",") print "\nParsing input files ..." list_Locus = getListPairwiseAll(listPairwiseFiles) print "Creating Orthogroups ..." nb_orthogroups = makeOrthogroups(list_Locus, args.minspec, len(listPairwiseFiles), args.verbose, args.paralogs) print "\n{} orthogroups have been infered from {} pairwise comparisons by RBH\n".format(nb_orthogroups, len(listPairwiseFiles)) if __name__ == "__main__": main()