Mercurial > repos > abims-sbr > pogs
comparison scripts/pogs.py @ 0:7bee3426a442 draft default tip
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
| author | abims-sbr |
|---|---|
| date | Fri, 01 Feb 2019 10:24:29 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:7bee3426a442 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # coding: utf8 | |
| 3 # September 2017 - Author : Victor Mataigne (Station Biologique de Roscoff - ABiMS) | |
| 4 # Command line : ./pogsPOO.py <list_of_input_files_separated_by_commas> <minimal number of species per group> [-v) [-p] | |
| 5 | |
| 6 """ | |
| 7 What it does: | |
| 8 - pogs.py parses output files from the "pairwise" tool of the AdaptSearch suite and proceeds to gather genes in orthogroups (using transitivity). | |
| 9 - A minimal number of species per group has to be set. | |
| 10 | |
| 11 BETA VERSION | |
| 12 """ | |
| 13 | |
| 14 import os, argparse, itertools | |
| 15 import numpy as np | |
| 16 import pandas as pd | |
| 17 | |
| 18 """ Definition of a locus : header + sequence + a tag """ | |
| 19 class Locus: | |
| 20 | |
| 21 def __init__(self, header, sequence): | |
| 22 self.header = header | |
| 23 self.sequence = sequence | |
| 24 self.tagged = False | |
| 25 | |
| 26 def __str__(self): | |
| 27 return "{}{}".format(self.header, self.sequence) | |
| 28 | |
| 29 def __eq__(self, other): | |
| 30 return self.getHeader() == other.getHeader() # Test if two loci are the same | |
| 31 | |
| 32 def __hash__(self): | |
| 33 return hash((self.header, self.sequence)) # Make the object iterable and hashable | |
| 34 | |
| 35 def getHeader(self): | |
| 36 return self.header | |
| 37 | |
| 38 def getSequence(self): | |
| 39 return self.sequence | |
| 40 | |
| 41 def getTag(self): | |
| 42 return self.tagged | |
| 43 | |
| 44 def prettyPrint(self): | |
| 45 # Used for debugging : print "{ Header : ", self.header[0:-1], "Tag : ", self.tagged, " }" | |
| 46 print "[ Header : {header} ]".format(header=self.header[0:-1]) | |
| 47 | |
| 48 def prettyPrint2(self): | |
| 49 print "[ Header : {header} Sequence : {sequence} ]".format(header=self.header[0:-1], sequence=self.sequence[0:-1]) | |
| 50 | |
| 51 """ Applies the getPairwiseCouple() function to a list of files and return a big list with ALL pairwises couples | |
| 52 Returns a list of sets (2 items per set) """ | |
| 53 def getListPairwiseAll(listPairwiseFiles): | |
| 54 | |
| 55 # Sub-Function | |
| 56 | |
| 57 """ Reads an output file from the 'Pairwise' tool (AdaptSearch suite) and returns its content into a list | |
| 58 Returns a list of sets (2 items per set) """ | |
| 59 def getPairwiseCouple(pairwiseFile): | |
| 60 list_pairwises_2sp = [] | |
| 61 with open(pairwiseFile, "r") as file: | |
| 62 for name, sequence, name2, sequence2 in itertools.izip_longest(*[file]*4): | |
| 63 # One locus every two lines (one pairwise couple = 4 lines) : header + sequence | |
| 64 locus1 = Locus(name, sequence) | |
| 65 locus2 = Locus(name2, sequence2) | |
| 66 group = set([]) | |
| 67 group.add(locus1) | |
| 68 group.add(locus2) | |
| 69 list_pairwises_2sp.append(group) | |
| 70 return (list_pairwises_2sp) | |
| 71 | |
| 72 # Function | |
| 73 list_pairwises_allsp = [] | |
| 74 for file in listPairwiseFiles: | |
| 75 listPairwises = getPairwiseCouple(file) | |
| 76 for pairwise in listPairwises: | |
| 77 list_pairwises_allsp.append(pairwise) # all pairwises in the same 1D list | |
| 78 return list_pairwises_allsp | |
| 79 | |
| 80 """ Proceeds to create orthogroups by putting together pairwise couples sharing a locus. | |
| 81 Iterates over the orthogroups list and tag to 'True' the pairwise couple already gathered in a group to avoid | |
| 82 redondancy. Writes each orthogroup in a fasta file | |
| 83 Returns an integer (a list length) """ | |
| 84 def makeOrthogroups(list_pairwises_allsp, minspec, nb_rbh, verbose, paralogs): | |
| 85 | |
| 86 # Sub-funtions | |
| 87 | |
| 88 """ Check if a locus/group has already been treated in makeOrthogroups() | |
| 89 Returns a boolean """ | |
| 90 def checkIfTagged(pair): | |
| 91 tag = True | |
| 92 for element in pair: | |
| 93 if not element.getTag() and tag: # use a list comprehension maybe ? | |
| 94 tag = False | |
| 95 return tag | |
| 96 | |
| 97 """ True means a locus/group has already been treated in makeOrthogroups() | |
| 98 A stronger code would be to implement a method inside the class Locus """ | |
| 99 def tagGroup(pair): | |
| 100 for element in pair: | |
| 101 element.tagged = True | |
| 102 | |
| 103 """ Write an orthogroup in a file """ | |
| 104 def writeOutputFile(orthogroup, number, naming): | |
| 105 name = "" | |
| 106 if naming: | |
| 107 name = "orthogroup_{}_with_{}_sequences_withParalogs.fasta".format(number, len(orthogroup)) | |
| 108 else : | |
| 109 name = "orthogroup_{}_with_{}_sequences.fasta".format(number, len(orthogroup)) | |
| 110 result = open(name, "w") | |
| 111 with result: | |
| 112 for locus in orthogroup: | |
| 113 if locus.getHeader()[-1] == "\n": | |
| 114 result.write("%s" % locus.getHeader()) # write geneID | |
| 115 else : | |
| 116 result.write("%s\n" % locus.Header()) # write geneID | |
| 117 if locus.getSequence()[-1] == "\n": | |
| 118 result.write("%s" % locus.getSequence()) # write sequence | |
| 119 else : | |
| 120 result.write("%s\n" % locus.getSequence()) # write sequence | |
| 121 if naming: | |
| 122 os.system("mv {} outputs_withParalogs/".format(name)) | |
| 123 else : | |
| 124 os.system("mv {} outputs/".format(name)) | |
| 125 | |
| 126 """ Parse an orthogroup list to keep only one paralog sequence per species & per group | |
| 127 (Keeps the 1st paralogous encoutered) | |
| 128 Returns a list """ | |
| 129 def filterParalogs(list_orthogroups, minspec): | |
| 130 list_orthogroups_format = [] | |
| 131 j = 1 | |
| 132 | |
| 133 for nofilter_group in list_orthogroups: | |
| 134 new_group = [] | |
| 135 species = {} | |
| 136 for loci in nofilter_group: | |
| 137 species[loci.getHeader()[1:3]] = False | |
| 138 for loci in nofilter_group: | |
| 139 if not species[loci.getHeader()[1:3]]: | |
| 140 new_group.append(loci) | |
| 141 species[loci.getHeader()[1:3]] = True | |
| 142 | |
| 143 if len(new_group) >= minspec: # Drop too small orthogroups | |
| 144 list_orthogroups_format.append(new_group) | |
| 145 writeOutputFile(new_group, j, False) | |
| 146 j += 1 | |
| 147 | |
| 148 return list_orthogroups_format | |
| 149 | |
| 150 """ Builds a 2D array for a summary | |
| 151 Returns a numpy 2D array """ | |
| 152 def countings(listOrthogroups, nb_rbh): | |
| 153 | |
| 154 def compute_nbspec(nb_rbh): | |
| 155 | |
| 156 def factorielle(x): | |
| 157 n = 1 | |
| 158 s = 0 | |
| 159 while n <= x: | |
| 160 s += n | |
| 161 n += 1 | |
| 162 return s | |
| 163 | |
| 164 x = 2 | |
| 165 nb_specs = 0 | |
| 166 while x*x - factorielle(x) < nb_rbh: | |
| 167 x += 1 | |
| 168 return x | |
| 169 #listOrthogroups.sort().reverse() | |
| 170 #nblines = len(listOrthogroups[0]) | |
| 171 nblines = 0 | |
| 172 for group in listOrthogroups: | |
| 173 if len(group) > nblines: | |
| 174 nblines = len(group) | |
| 175 matrix = np.array([[0]*compute_nbspec(nb_rbh)]*nblines) | |
| 176 | |
| 177 for group in listOrthogroups: | |
| 178 listSpecs = [] | |
| 179 for loci in group: | |
| 180 if loci.getHeader()[1:3] not in listSpecs: | |
| 181 listSpecs.append(loci.getHeader()[1:3]) | |
| 182 matrix[len(group)-1][len(listSpecs)-1] += 1 | |
| 183 | |
| 184 return matrix | |
| 185 | |
| 186 """ numpy 2D array in a nice dataframe | |
| 187 Returns a pandas 2D dataframe """ | |
| 188 def asFrame(matrix) : | |
| 189 index = [0]*len(matrix) | |
| 190 colnames = [0]*len(matrix[0]) | |
| 191 index = [str(i+1)+" seqs" for i in range(len(matrix))] | |
| 192 colnames = [str(i+1)+" sps" for i in range(len(matrix[0]))] | |
| 193 df = pd.DataFrame(matrix, index=index, columns=colnames) | |
| 194 return df # Mettre une selection pour ne renvoyer que les lignes et les colonnes qui somment > 0 | |
| 195 #return df.loc['4 seqs':'9 seqs'].loc[:,colnames[3:]] | |
| 196 | |
| 197 # Function ------------------------------------------------------------------------------------------------- | |
| 198 list_orthogroups = [] | |
| 199 | |
| 200 for ortho_pair1 in list_pairwises_allsp[0:-1]: | |
| 201 if not checkIfTagged(ortho_pair1): | |
| 202 orthogroup = ortho_pair1 # the orthogroup grows as we go throught the second loop | |
| 203 | |
| 204 # check for common locus between two groups | |
| 205 for ortho_pair2 in list_pairwises_allsp[list_pairwises_allsp.index(ortho_pair1) + 1:]: | |
| 206 if len(orthogroup.intersection(ortho_pair2)) != 0 and not checkIfTagged(ortho_pair2): | |
| 207 orthogroup.update(orthogroup | ortho_pair2) | |
| 208 tagGroup(ortho_pair2) | |
| 209 | |
| 210 # Check if subgroup is already computed | |
| 211 if len(list_orthogroups) > 0: | |
| 212 presence = False | |
| 213 for group in list_orthogroups: | |
| 214 if len(group.intersection(orthogroup)) != 0: | |
| 215 group.update(group | orthogroup) | |
| 216 presence = True | |
| 217 if not presence: | |
| 218 list_orthogroups.append(orthogroup) | |
| 219 else: | |
| 220 list_orthogroups.append(orthogroup) | |
| 221 | |
| 222 # Options -------------------------------------------------------------------------------------------------- | |
| 223 | |
| 224 """ nb : I could try to implement a more complex code which does in the same previous loop all the following lines, to avoid multiples parsing of | |
| 225 the orthogroups list, but the code would become hardly readable. Since the whole program is already quite fast, I chosed code simplicity | |
| 226 over code efficiency """ | |
| 227 | |
| 228 # Print summary table with all paralogs | |
| 229 if verbose : | |
| 230 frame = countings(list_orthogroups, nb_rbh) | |
| 231 df = asFrame(frame) | |
| 232 print "\n Summary before paralogous filtering : \n" | |
| 233 print df.loc[df.ne(0).any(1),df.ne(0).any()], "\n" # Don't display columns and lines filled with 0 | |
| 234 | |
| 235 # Write outputFile with all the paralogous | |
| 236 if paralogs: | |
| 237 print "Writing orthogroups with paralogs files ...\n" | |
| 238 j = 1 | |
| 239 for group in list_orthogroups: | |
| 240 if len(group) >= minspec: | |
| 241 writeOutputFile(group, j, True) | |
| 242 j += 1 | |
| 243 | |
| 244 # Paralogs filtering and summary ---------------------------------------------------------------------------- | |
| 245 | |
| 246 print "Filtering paralogous sequences and writing final orthogroups files ..." | |
| 247 print " (Dropping Orthogroups with less than {} species)".format(minspec) | |
| 248 | |
| 249 # writeOutputFile() is called in filterParalogs() | |
| 250 list_orthogroups_format = filterParalogs(list_orthogroups, minspec) | |
| 251 | |
| 252 frame = countings(list_orthogroups_format, nb_rbh) | |
| 253 df = asFrame(frame) | |
| 254 print "\n Summary after paralogous filtering : \n" | |
| 255 print df.loc[df.ne(0).any(1),df.ne(0).any()] | |
| 256 | |
| 257 #return only the length of the list (at this point the program doesn't need more) | |
| 258 return len(list_orthogroups_format) | |
| 259 | |
| 260 def main(): | |
| 261 parser = argparse.ArgumentParser() | |
| 262 parser.add_argument("files", help="Input files separated by commas. Each file contains all the reciprocical best hits between a pair of species") | |
| 263 parser.add_argument("minspec", help="Only keep Orthogroups with at least this number of species", type=int) | |
| 264 parser.add_argument("-v", "--verbose", action="store_true", help="A supplemental summary table of orthogroups before paralogs filtering will be returned") | |
| 265 parser.add_argument("-p", "--paralogs", action="store_true", help="Proceeds to write orthogroups also before paralogous filtering") | |
| 266 args = parser.parse_args() | |
| 267 | |
| 268 print "*** pogs.py ***" | |
| 269 print "\nBuilding of orthogroups based on pairs of genes obtained by pairwise comparisons between pairs of species." | |
| 270 print "Genes are gathered in orthogroups based on the principle of transitivity between genes pairs." | |
| 271 | |
| 272 os.system("mkdir outputs") | |
| 273 if args.paralogs: os.system("mkdir outputs_withParalogs") | |
| 274 infiles = args.files | |
| 275 listPairwiseFiles = str.split(infiles, ",") | |
| 276 print "\nParsing input files ..." | |
| 277 list_Locus = getListPairwiseAll(listPairwiseFiles) | |
| 278 print "Creating Orthogroups ..." | |
| 279 nb_orthogroups = makeOrthogroups(list_Locus, args.minspec, len(listPairwiseFiles), args.verbose, args.paralogs) | |
| 280 print "\n{} orthogroups have been infered from {} pairwise comparisons by RBH\n".format(nb_orthogroups, len(listPairwiseFiles)) | |
| 281 | |
| 282 if __name__ == "__main__": | |
| 283 main() |
