comparison GFFalign.py @ 0:294f5ba28746 draft

"planemo upload for repository https://github.com/abims-sbr/tools-abims/tools/gffalign commit d8aa0e49353e78e5fd772498a1fcf591e2744f99"
author abims-sbr
date Mon, 30 Nov 2020 18:20:46 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:294f5ba28746
1 #!/usr/bin/python3
2
3 import argparse
4 import os
5 import gffutils
6 from Bio import SeqIO
7
8
9 class GeneComp:
10 # This cass aims to discover the position of the genes in the alignment
11 # and to extract informations on the genes characteristics
12
13 def __init__(self, ccgene, qstart, qend, otgene, dstart, dend, extract, tol, gattr, output=None):
14 # d* / ot* are the db result, q* / cc the query
15 # gene are the gene from maf-tab file
16 # start, end are the coordinates '''
17 self.otbeg = int(otgene.start) - dstart
18 self.otend = int(otgene.end) - dstart
19 self.ccbeg = int(ccgene.start) - qstart
20 self.ccend = int(ccgene.end) - qstart
21 self.tol = tol # how much tolerance in nucleotides
22 self.ccname = ccgene.chrom
23 self.otname = otgene.chrom
24 self.q_outstart = qstart+self.otbeg
25 self.q_outend = qstart+self.otend
26 self.genelist = []
27 self.qstart = qstart
28 if extract:
29 self.fastafile = extract[0]
30 self.fastaout = extract[1]
31 self.output = output
32 self.out = ""
33 self.gattr = f"New_annotation='{gattr}'"
34
35 def is_equal(self):
36 if (self.otbeg - self.tol) <= self.ccbeg <= (self.otbeg + self.tol) and (self.otend - self.tol) <= self.ccend <= (self.otend + self.tol):
37 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=confirmed"
38 return self.out
39
40 def is_shorter(self):
41 if (self.otbeg + self.tol) < self.ccbeg and (self.otend - self.tol) > self.ccend:
42 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=shorter"
43 return self.out
44 elif (self.otbeg - self.tol) <= self.ccbeg <= (self.otbeg + self.tol) and (self.otend - self.tol) > self.ccend:
45 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=shorter_right"
46 return self.out
47 elif (self.otbeg + self.tol) < self.ccbeg and (self.otend - self.tol) <= self.ccend <= (self.otend + self.tol):
48 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=shorter_left"
49 return self.out
50
51 def is_longer(self):
52 if (self.otbeg - self.tol) > self.ccbeg and (self.otend + self.tol) < self.ccend:
53 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=longer"
54 return self.out
55 elif (self.otbeg - self.tol) <= self.ccbeg <= (self.otbeg + self.tol) and (self.otend + self.tol) < self.ccend:
56 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=longer_right"
57 return self.out
58 elif (self.otbeg - self.tol) > self.ccbeg and (self.otend - self.tol) <= self.ccend <= (self.otend + self.tol):
59 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=longer_left"
60 return self.out
61
62 def is_offset(self):
63 if (self.otbeg + self.tol) < self.ccbeg < (self.otend - self.tol) and (self.otend + self.tol) < self.ccend:
64 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=offset_right"
65 return self.out
66 if (self.otbeg - self.tol) > self.ccbeg and (self.otbeg - self.tol) < self.ccend < self.otend + self.tol:
67 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=offset_left"
68 return self.out
69
70 def is_different(self):
71 if self.otbeg - self.tol > self.ccend or self.otend + self.tol < self.otbeg:
72 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=new"
73 return self.out
74
75 def extract_fasta(self):
76 with open(self.fastaout,"a") as fileout:
77 with open(self.fastafile) as filefasta:
78 for record in SeqIO.parse(filefasta,"fasta"):
79 if record.id == self.ccname:
80 fileout.write(f">{self.ccname}_{self.q_outstart}:{self.q_outend}\n{record.seq[self.q_outstart:self.q_outend]}\n")
81
82 # def extract_stout(self):
83 # with open(self.fastafile) as filefasta:
84 # for record in SeqIO.parse(filefasta,"fasta"):
85 # if record.id == self.ccname:
86 # print(f">{self.ccname}_{self.q_outstart}:{self.q_outend}\n{record.seq[self.q_outstart:self.q_outend]}")
87
88
89 #def __str__(self):
90 # return f"{self.ccname}\t{self.q_outstart}\t{self.q_outend}"
91
92
93 def fout(self):
94 try:
95 if self.__class__.fout.called:
96 with open(self.output,"a") as fileout:
97 fileout.write(self.out+"\n")
98 except AttributeError:
99 try:
100 if os.path.isfile(self.output):
101 os.remove(self.output)
102 with open(self.output,"a") as fileout:
103 #fileout.write("##gff-version 3\n")
104 fileout.write(f"##gff-version 3\n{self.out}\n")
105 self.__class__.fout.called = True
106 self.__class__.fout(self)
107 except TypeError:
108 pass
109
110
111 def diff_gene(query_genes, target_genes, dstart, dend, qstart, qend, query_db):
112 # are the two genes the same?
113 if args.extract:
114 extract = args.extract
115 else:
116 extract = None
117
118 if args.tollerance:
119 tol = args.tollerance
120 else:
121 tol = 30
122
123
124 for otgene in target_genes:
125 # gattr is a variable created to store the the annotation of the target gene.
126 # It will be use to suggest a functional annotation
127 gattr = str(otgene).split("\t")[-1]
128 for ccgene in query_genes:
129 algene = GeneComp(ccgene, qstart, qend, otgene, dstart, dend, extract, tol, gattr)
130 if "new" in args.verbosity or "all" in args.verbosity:
131 if algene.is_different():
132 algene_out = algene.is_different()
133 algene_name = algene_out.split("\t")[0]
134 algene_start = int(algene_out.split("\t")[3])
135 algene_end = int(algene_out.split("\t")[4])
136 #print(al_name, al_start, al_end)
137 #print(list(target_db.region(region=(al_name, al_start, al_end), completely_within=True)))
138 if not list(query_db.region(region=(algene_name, algene_start, algene_end), completely_within=False)):
139 if args.output:
140 algene.output=args.output
141 algene.fout()
142 else:
143 print(algene.is_different())
144 if args.extract:
145 algene.extract_fasta()
146
147 if "shorter" in args.verbosity or "all" in args.verbosity:
148 if algene.is_shorter():
149 if args.output:
150 algene.output=args.output
151 algene.fout()
152 else:
153 print(algene.is_shorter())
154 if args.extract:
155 algene.extract_fasta()
156 if "longer" in args.verbosity or "all" in args.verbosity:
157 if algene.is_longer():
158 if args.output:
159 algene.output=args.output
160 algene.fout()
161 else:
162 print(algene.is_longer())
163 if args.extract:
164 algene.extract_fasta()
165 if "offset" in args.verbosity or "all" in args.verbosity:
166 if algene.is_offset():
167 if args.output:
168 algene.output=args.output
169 algene.fout()
170 else:
171 print(algene.is_offset())
172 if args.extract:
173 algene.extract_fasta()
174 if "confirmed" in args.verbosity:
175 if algene.is_equal():
176 if args.output:
177 algene.output=args.output
178 algene.fout()
179 else:
180 print(algene.is_equal())
181 if args.extract:
182 algene.extract_fasta()
183
184
185 def gff_gene_check(GffName, db_name, memory=0):
186 # The IDs on the GFFs must be unique. Moreover, because only the gene
187 # information are needed, all the other information must be removed
188 # from the GFFs.
189 tempgff=""
190 for line in open(GffName):
191 if line[0] != "#":
192 if line.split()[2] == "gene":
193 tempgff+=line
194
195 else:
196 tempgff+=line
197
198 if memory:
199 # Write the db in memory and return it as variable so it can be used
200 # as subclass of _DBCreator
201 dbout = gffutils.create_db(tempgff, ":memory:", from_string=True)
202 return dbout
203 else:
204 gffutils.create_db(tempgff, db_name, from_string=True)
205
206 def check_strand(start,leng,strand):
207 if strand == "+":
208 end = start+leng
209 else:
210 end = start-leng
211 start,end = end,start
212 return(start,end)
213
214 def check_position(line, query_db,target_db):
215 # check if there is a gene in the aligned area
216 # lets' start with the coordinatescharacteristics
217 elsp = line.split('\t')
218 dname = elsp[1]
219 dstart = int(elsp[2])
220 dlen = int(elsp[3])
221 dstrand = elsp[4]
222 qname = elsp[6]
223 qstart = int(elsp[7])
224 qlen = int(elsp[8])
225 qstrand = elsp[9]
226
227 # check the strand, if - reverse the start and the end
228 qstart,qend = check_strand(qstart,qlen,qstrand)
229 dstart,dend = check_strand(dstart,dlen,dstrand)
230
231 #counter
232 d_counter = 0
233
234 # lists of gene within the coordinates
235 target_genes = [ gene for gene in list(target_db.region(region=(dname, dstart, dend), completely_within=True))]
236 query_genes = [ gene for gene in list(query_db.region(region=(qname, qstart, qend), completely_within=True))]
237
238 if len(target_genes):
239 # and len(query_genes):
240 ######
241 # PER PROVARE HO TOLTO QUESTO, DA CONTROLLARE
242 ######
243 #if len(target_genes) >= len(query_genes): # the number of genes in the aligned area must be bigger in the target (?)
244 diff_gene(query_genes, target_genes, dstart, dend, qstart, qend, query_db)
245
246
247 if d_counter>=1:
248 return(d_counter)
249 else:
250 return(0)
251
252
253 def main(args):
254 fcounter = 0
255 # import the GFF library and create (has to be done only once)
256 # the GFF databases
257 target_db = query_db = None
258 db_query_name=args.queryGff + "_db"
259 db_target_name=args.targetGff + "_db"
260
261 if args.force_database:
262 # remove the database if required by user
263 os.remove(db_query_name)
264 os.remove(db_target_name)
265
266 if args.use_query_database:
267 # use the db passed by the user
268 query_db = gffutils.FeatureDB(args.use_query_database)
269 else:
270 #create the db
271 #gffutils.create_db(args.queryGff, db_query_name)
272 qdbout = gff_gene_check(args.queryGff, db_query_name, args.memory)
273
274 if args.memory:
275 query_db = gffutils.FeatureDB(qdbout.dbfn)
276 else:
277 query_db = gffutils.FeatureDB(db_query_name)
278 # except ValueError:
279 # print("Please check your GFF file")
280 # except:
281 # raise Exception(f"It seems you already have a db called {db_query_name}. Use the -qu if you want to use it or delete the")
282
283
284 if args.use_target_database:
285 target_db = gffutils.FeatureDB(args.use_target_database)
286 else:
287 tdbout = gff_gene_check(args.targetGff, db_target_name, args.memory)
288 if args.memory:
289 target_db = gffutils.FeatureDB(tdbout.dbfn)
290 else:
291 target_db = gffutils.FeatureDB(db_target_name)
292 # except ValueError:
293 # print("Please check your GFF file")if "all" in args.verbosity:
294 # except:
295 # raise Exception(f"It seems you already have a db called {db_target_name}. Use the -du if you want to use it or delete the")
296
297 # put the content of the DB in objects
298 with open(args.aln) as maftab:
299 for line in maftab:
300 if line[0] != "#":
301 fcounter += check_position(line, query_db, target_db)
302
303 if __name__ == '__main__':
304 parser = argparse.ArgumentParser(description = '''Tool to extract genes coordinates from a whole genome alignent.
305 This script needs an alignement in TAB format and two gff files''')
306 parser.add_argument('aln', help='alignment file in TAB format. The suggested way to obtain it is to run Last and\
307 than convert the file from MAF to TAB with maf-convert')
308 parser.add_argument('queryGff',
309 help='''Gff file of the query organism. The gene IDs in the GFF must be unique.''')
310 parser.add_argument('targetGff',
311 help='''Gff file of the "target" organism. The gene IDs in the GFF must be unique.''')
312 parser.add_argument("-uq", "--use-query-database",
313 help='''Use this parament if you already have a query gffutils formatted database or
314 if it\'s not the first time you run this script''', type=str)
315 parser.add_argument("-ut", "--use-target-database",
316 help='''Use this parament if you already have a target gffutils formatted database or
317 if it\'s not the first time you run this script''', type=str)
318 parser.add_argument("-fd", "--force-database", help="delete old gffutils databases and create new ones", action='store_true')
319 parser.add_argument("-m", "--memory", help='''create an in-memory database. This option can't be used with the other DB options.
320 probably usefull in Galaxy integration''', action='store_true' )
321 parser.add_argument("-e", "--extract",
322 help='''Extract the fasta sequence of the new suggested gene. It takes two argument: the fasta file
323 of the genome and the name of the output file. This will slow down the process A LOT.''', nargs=2, type=str)
324 #parser.add_argument("-es", "--extract_stout", help='Like -e but it will print the result in the standard output. FASTER than -e. It need the fasta file', type=str)
325 parser.add_argument("-o", "--output", help='Name of the output file. Default output is stout', type=str)
326 parser.add_argument("-t", "--tollerance", help='Interval, in nucleotide, within a gene is considered in the same position. Default 30', default=30, type=int)
327 parser.add_argument("-v", "--verbosity",
328 help='''Output options. If not specify the software shows only the genes that are in the exact position of the
329 genes in the target. It\'s possible to show annotated genes that are in aligned regions but that have different lengths or in slightly
330 different positions. It's possible to select multiple, space separated, values.''',
331 choices=["all","shorter", "longer", "offset", "new", "confirmed"], nargs='*', default='new', type=str)
332 #metavar=('all','shorter','longer','shifted','new'),
333 #action='append',
334 #choices=["all","shorter", "longer", "shifted", "new"], default="new")
335 parser.add_argument('--version', action='version', version='0.1.0')
336 args = parser.parse_args()
337 main(args)