comparison scripts/pogs.py @ 0:7bee3426a442 draft default tip

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author abims-sbr
date Fri, 01 Feb 2019 10:24:29 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:7bee3426a442
1 #!/usr/bin/env python
2 # coding: utf8
3 # September 2017 - Author : Victor Mataigne (Station Biologique de Roscoff - ABiMS)
4 # Command line : ./pogsPOO.py <list_of_input_files_separated_by_commas> <minimal number of species per group> [-v) [-p]
5
6 """
7 What it does:
8 - pogs.py parses output files from the "pairwise" tool of the AdaptSearch suite and proceeds to gather genes in orthogroups (using transitivity).
9 - A minimal number of species per group has to be set.
10
11 BETA VERSION
12 """
13
14 import os, argparse, itertools
15 import numpy as np
16 import pandas as pd
17
18 """ Definition of a locus : header + sequence + a tag """
19 class Locus:
20
21 def __init__(self, header, sequence):
22 self.header = header
23 self.sequence = sequence
24 self.tagged = False
25
26 def __str__(self):
27 return "{}{}".format(self.header, self.sequence)
28
29 def __eq__(self, other):
30 return self.getHeader() == other.getHeader() # Test if two loci are the same
31
32 def __hash__(self):
33 return hash((self.header, self.sequence)) # Make the object iterable and hashable
34
35 def getHeader(self):
36 return self.header
37
38 def getSequence(self):
39 return self.sequence
40
41 def getTag(self):
42 return self.tagged
43
44 def prettyPrint(self):
45 # Used for debugging : print "{ Header : ", self.header[0:-1], "Tag : ", self.tagged, " }"
46 print "[ Header : {header} ]".format(header=self.header[0:-1])
47
48 def prettyPrint2(self):
49 print "[ Header : {header} Sequence : {sequence} ]".format(header=self.header[0:-1], sequence=self.sequence[0:-1])
50
51 """ Applies the getPairwiseCouple() function to a list of files and return a big list with ALL pairwises couples
52 Returns a list of sets (2 items per set) """
53 def getListPairwiseAll(listPairwiseFiles):
54
55 # Sub-Function
56
57 """ Reads an output file from the 'Pairwise' tool (AdaptSearch suite) and returns its content into a list
58 Returns a list of sets (2 items per set) """
59 def getPairwiseCouple(pairwiseFile):
60 list_pairwises_2sp = []
61 with open(pairwiseFile, "r") as file:
62 for name, sequence, name2, sequence2 in itertools.izip_longest(*[file]*4):
63 # One locus every two lines (one pairwise couple = 4 lines) : header + sequence
64 locus1 = Locus(name, sequence)
65 locus2 = Locus(name2, sequence2)
66 group = set([])
67 group.add(locus1)
68 group.add(locus2)
69 list_pairwises_2sp.append(group)
70 return (list_pairwises_2sp)
71
72 # Function
73 list_pairwises_allsp = []
74 for file in listPairwiseFiles:
75 listPairwises = getPairwiseCouple(file)
76 for pairwise in listPairwises:
77 list_pairwises_allsp.append(pairwise) # all pairwises in the same 1D list
78 return list_pairwises_allsp
79
80 """ Proceeds to create orthogroups by putting together pairwise couples sharing a locus.
81 Iterates over the orthogroups list and tag to 'True' the pairwise couple already gathered in a group to avoid
82 redondancy. Writes each orthogroup in a fasta file
83 Returns an integer (a list length) """
84 def makeOrthogroups(list_pairwises_allsp, minspec, nb_rbh, verbose, paralogs):
85
86 # Sub-funtions
87
88 """ Check if a locus/group has already been treated in makeOrthogroups()
89 Returns a boolean """
90 def checkIfTagged(pair):
91 tag = True
92 for element in pair:
93 if not element.getTag() and tag: # use a list comprehension maybe ?
94 tag = False
95 return tag
96
97 """ True means a locus/group has already been treated in makeOrthogroups()
98 A stronger code would be to implement a method inside the class Locus """
99 def tagGroup(pair):
100 for element in pair:
101 element.tagged = True
102
103 """ Write an orthogroup in a file """
104 def writeOutputFile(orthogroup, number, naming):
105 name = ""
106 if naming:
107 name = "orthogroup_{}_with_{}_sequences_withParalogs.fasta".format(number, len(orthogroup))
108 else :
109 name = "orthogroup_{}_with_{}_sequences.fasta".format(number, len(orthogroup))
110 result = open(name, "w")
111 with result:
112 for locus in orthogroup:
113 if locus.getHeader()[-1] == "\n":
114 result.write("%s" % locus.getHeader()) # write geneID
115 else :
116 result.write("%s\n" % locus.Header()) # write geneID
117 if locus.getSequence()[-1] == "\n":
118 result.write("%s" % locus.getSequence()) # write sequence
119 else :
120 result.write("%s\n" % locus.getSequence()) # write sequence
121 if naming:
122 os.system("mv {} outputs_withParalogs/".format(name))
123 else :
124 os.system("mv {} outputs/".format(name))
125
126 """ Parse an orthogroup list to keep only one paralog sequence per species & per group
127 (Keeps the 1st paralogous encoutered)
128 Returns a list """
129 def filterParalogs(list_orthogroups, minspec):
130 list_orthogroups_format = []
131 j = 1
132
133 for nofilter_group in list_orthogroups:
134 new_group = []
135 species = {}
136 for loci in nofilter_group:
137 species[loci.getHeader()[1:3]] = False
138 for loci in nofilter_group:
139 if not species[loci.getHeader()[1:3]]:
140 new_group.append(loci)
141 species[loci.getHeader()[1:3]] = True
142
143 if len(new_group) >= minspec: # Drop too small orthogroups
144 list_orthogroups_format.append(new_group)
145 writeOutputFile(new_group, j, False)
146 j += 1
147
148 return list_orthogroups_format
149
150 """ Builds a 2D array for a summary
151 Returns a numpy 2D array """
152 def countings(listOrthogroups, nb_rbh):
153
154 def compute_nbspec(nb_rbh):
155
156 def factorielle(x):
157 n = 1
158 s = 0
159 while n <= x:
160 s += n
161 n += 1
162 return s
163
164 x = 2
165 nb_specs = 0
166 while x*x - factorielle(x) < nb_rbh:
167 x += 1
168 return x
169 #listOrthogroups.sort().reverse()
170 #nblines = len(listOrthogroups[0])
171 nblines = 0
172 for group in listOrthogroups:
173 if len(group) > nblines:
174 nblines = len(group)
175 matrix = np.array([[0]*compute_nbspec(nb_rbh)]*nblines)
176
177 for group in listOrthogroups:
178 listSpecs = []
179 for loci in group:
180 if loci.getHeader()[1:3] not in listSpecs:
181 listSpecs.append(loci.getHeader()[1:3])
182 matrix[len(group)-1][len(listSpecs)-1] += 1
183
184 return matrix
185
186 """ numpy 2D array in a nice dataframe
187 Returns a pandas 2D dataframe """
188 def asFrame(matrix) :
189 index = [0]*len(matrix)
190 colnames = [0]*len(matrix[0])
191 index = [str(i+1)+" seqs" for i in range(len(matrix))]
192 colnames = [str(i+1)+" sps" for i in range(len(matrix[0]))]
193 df = pd.DataFrame(matrix, index=index, columns=colnames)
194 return df # Mettre une selection pour ne renvoyer que les lignes et les colonnes qui somment > 0
195 #return df.loc['4 seqs':'9 seqs'].loc[:,colnames[3:]]
196
197 # Function -------------------------------------------------------------------------------------------------
198 list_orthogroups = []
199
200 for ortho_pair1 in list_pairwises_allsp[0:-1]:
201 if not checkIfTagged(ortho_pair1):
202 orthogroup = ortho_pair1 # the orthogroup grows as we go throught the second loop
203
204 # check for common locus between two groups
205 for ortho_pair2 in list_pairwises_allsp[list_pairwises_allsp.index(ortho_pair1) + 1:]:
206 if len(orthogroup.intersection(ortho_pair2)) != 0 and not checkIfTagged(ortho_pair2):
207 orthogroup.update(orthogroup | ortho_pair2)
208 tagGroup(ortho_pair2)
209
210 # Check if subgroup is already computed
211 if len(list_orthogroups) > 0:
212 presence = False
213 for group in list_orthogroups:
214 if len(group.intersection(orthogroup)) != 0:
215 group.update(group | orthogroup)
216 presence = True
217 if not presence:
218 list_orthogroups.append(orthogroup)
219 else:
220 list_orthogroups.append(orthogroup)
221
222 # Options --------------------------------------------------------------------------------------------------
223
224 """ nb : I could try to implement a more complex code which does in the same previous loop all the following lines, to avoid multiples parsing of
225 the orthogroups list, but the code would become hardly readable. Since the whole program is already quite fast, I chosed code simplicity
226 over code efficiency """
227
228 # Print summary table with all paralogs
229 if verbose :
230 frame = countings(list_orthogroups, nb_rbh)
231 df = asFrame(frame)
232 print "\n Summary before paralogous filtering : \n"
233 print df.loc[df.ne(0).any(1),df.ne(0).any()], "\n" # Don't display columns and lines filled with 0
234
235 # Write outputFile with all the paralogous
236 if paralogs:
237 print "Writing orthogroups with paralogs files ...\n"
238 j = 1
239 for group in list_orthogroups:
240 if len(group) >= minspec:
241 writeOutputFile(group, j, True)
242 j += 1
243
244 # Paralogs filtering and summary ----------------------------------------------------------------------------
245
246 print "Filtering paralogous sequences and writing final orthogroups files ..."
247 print " (Dropping Orthogroups with less than {} species)".format(minspec)
248
249 # writeOutputFile() is called in filterParalogs()
250 list_orthogroups_format = filterParalogs(list_orthogroups, minspec)
251
252 frame = countings(list_orthogroups_format, nb_rbh)
253 df = asFrame(frame)
254 print "\n Summary after paralogous filtering : \n"
255 print df.loc[df.ne(0).any(1),df.ne(0).any()]
256
257 #return only the length of the list (at this point the program doesn't need more)
258 return len(list_orthogroups_format)
259
260 def main():
261 parser = argparse.ArgumentParser()
262 parser.add_argument("files", help="Input files separated by commas. Each file contains all the reciprocical best hits between a pair of species")
263 parser.add_argument("minspec", help="Only keep Orthogroups with at least this number of species", type=int)
264 parser.add_argument("-v", "--verbose", action="store_true", help="A supplemental summary table of orthogroups before paralogs filtering will be returned")
265 parser.add_argument("-p", "--paralogs", action="store_true", help="Proceeds to write orthogroups also before paralogous filtering")
266 args = parser.parse_args()
267
268 print "*** pogs.py ***"
269 print "\nBuilding of orthogroups based on pairs of genes obtained by pairwise comparisons between pairs of species."
270 print "Genes are gathered in orthogroups based on the principle of transitivity between genes pairs."
271
272 os.system("mkdir outputs")
273 if args.paralogs: os.system("mkdir outputs_withParalogs")
274 infiles = args.files
275 listPairwiseFiles = str.split(infiles, ",")
276 print "\nParsing input files ..."
277 list_Locus = getListPairwiseAll(listPairwiseFiles)
278 print "Creating Orthogroups ..."
279 nb_orthogroups = makeOrthogroups(list_Locus, args.minspec, len(listPairwiseFiles), args.verbose, args.paralogs)
280 print "\n{} orthogroups have been infered from {} pairwise comparisons by RBH\n".format(nb_orthogroups, len(listPairwiseFiles))
281
282 if __name__ == "__main__":
283 main()