annotate deg_annotate.py @ 0:b42373cddb77 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
author iuc
date Fri, 23 Nov 2018 01:59:47 -0500
parents
children e98d4ab5b5bc
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
1 import argparse
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
2 import os
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
3 from collections import defaultdict
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
4
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
5 from BCBio import GFF
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
6
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
7
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
def strandardize(strand):
    """Convert a numeric strand value into its '+' / '-' symbol.

    Args:
        strand: Strand value to convert. It is compared by its string form,
            so the integers ``-1`` / ``1`` and the strings ``'-1'`` / ``'1'``
            are treated alike.

    Returns:
        ``'-'`` for ``-1`` and ``'+'`` for ``1``. Any other value (for
        example ``None``, or a strand that is already ``'+'`` or ``'-'``)
        is returned unchanged.
    """
    # Single string conversion, then a table lookup; unknown values fall
    # through untouched so callers keep seeing exactly what they passed in.
    return {'-1': '-', '1': '+'}.get(str(strand), strand)
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
14
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
15
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
def gff_to_dict(f_gff, feat_type, idattr, txattr, attributes, input_type):
    """
    Collect per-gene annotation and, for DEXSeq input, per-exon BED lines.

    Only features of type ``feat_type`` are read, because not all GFF files
    contain gene and transcript features. From these features the gene
    chromosome, strand, start and end positions and the requested attributes
    are extracted. Anything that cannot be found stays 'NA'.

    Args:
        f_gff: Path of the GFF/GTF file to read.
        feat_type: Feature type (3rd GFF column) to parse, e.g. ``exon``.
        idattr: Qualifier holding the gene identifier. Features without it
            are reported on stdout and skipped.
        txattr: Qualifier holding the transcript identifier. Only used when
            ``input_type`` is ``"dexseq"``; features without it are reported
            on stdout and left out of the BED lines.
        attributes: Iterable of additional qualifier names to record for
            each gene.
        input_type: ``"dexseq"`` enables exon collection and BED output; any
            other value disables both.

    Returns:
        A tuple ``(annotation, bed_entries)``. ``annotation`` maps gene id to
        a dict with the keys ``chr``, ``strand``, ``start``, ``end`` and one
        key per requested attribute (missing keys read as ``'NA'``).
        ``bed_entries`` is a list of tab-separated six-column BED lines
        (without trailing newline) named ``<transcript id>:<exon number>``;
        it is empty unless ``input_type`` is ``"dexseq"``.
    """
    annotation = defaultdict(lambda: defaultdict(lambda: 'NA'))
    exon_pos = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    tx_info = defaultdict(lambda: defaultdict(str))
    collect_exons = input_type == "dexseq"

    with open(f_gff) as gff_handle:
        for rec in GFF.parse(gff_handle, limit_info=dict(gff_type=[feat_type]), target_lines=1):
            for sub_feature in rec.features:
                start = sub_feature.location.start
                end = sub_feature.location.end
                strand = strandardize(sub_feature.location.strand)
                try:
                    geneid = sub_feature.qualifiers[idattr][0]
                except KeyError:
                    print("No '%s' attribute found for the feature at position %s:%s:%s. "
                          "Please check your GTF/GFF file." % (idattr, rec.id, start, end))
                    continue

                gene = annotation[geneid]
                gene['chr'] = rec.id
                gene['strand'] = strand
                # The gene span is the union of all of its features: keep the
                # smallest start and the largest end seen so far.
                if gene['start'] == 'NA' or start <= int(gene['start']):
                    gene['start'] = start
                if gene['end'] == 'NA' or end >= int(gene['end']):
                    gene['end'] = end

                for attr in attributes:
                    # Keep the first real value found. A feature that lacks
                    # the attribute must not pin it to 'NA' when a later
                    # feature of the same gene does carry it.
                    if gene.get(attr, 'NA') != 'NA':
                        continue
                    try:
                        gene[attr] = sub_feature.qualifiers[attr][0]
                    except KeyError:
                        gene[attr] = 'NA'

                # extract exon information only in case of dexseq output
                if not collect_exons:
                    continue
                try:
                    txid = sub_feature.qualifiers[txattr][0]
                except KeyError:
                    print("No '%s' attribute found for the feature at position %s:%s:%s. "
                          "Please check your GTF/GFF file." % (txattr, rec.id, start, end))
                    continue
                tx_info[txid]['chr'] = rec.id
                tx_info[txid]['strand'] = strand
                # Nested dict keys de-duplicate identical (start, end) pairs.
                exon_pos[txid][int(start)][int(end)] = 1

    bed_entries = []
    # create BED lines only for dexseq output
    if collect_exons:
        for txid, ends_by_start in exon_pos.items():
            chrom = tx_info[txid]['chr']
            strand = tx_info[txid]['strand']
            # Exons are numbered along the transcript, so minus-strand
            # transcripts are walked from the highest coordinate downwards.
            descending = strand == '-'
            # strandardize() passes unknown strands through unchanged (e.g.
            # None), which str.join cannot handle; BED uses '.' for those.
            bed_strand = strand if strand in ('+', '-') else '.'
            for c, start in enumerate(sorted(ends_by_start, reverse=descending), 1):
                for end in sorted(ends_by_start[start], reverse=descending):
                    bed_entries.append('\t'.join([chrom, str(start), str(end),
                                                  txid + ':' + str(c), '0', bed_strand]))

    return annotation, bed_entries
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
83
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
84
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
85 def main():
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
86 parser = argparse.ArgumentParser(description='Annotate DESeq2/DEXSeq tables with information from GFF/GTF files')
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
87 parser.add_argument('-in', '--input', required=True,
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
88 help='DESeq2/DEXSeq output. It is allowed to have extra information, '
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
89 'but make sure that the original output columns are not altered')
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
90 parser.add_argument('-m', '--mode', required=True, choices=["deseq2", "dexseq"], default='deseq2',
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
91 help='Input file type')
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
92 parser.add_argument('-g', '--gff', required=True, help='The same annotation GFF/GTF file used for couting')
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
93 parser.add_argument('-t', '--type', default='exon', required=False,
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
94 help='feature type (3rd column in GFF file) to be used (default: exon)')
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
95 parser.add_argument('-i', '--idattr', default='gene_id', required=False,
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
96 help='GFF attribute to be used as feature ID. '
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
97 'This should match the first column of DESeq2 output(default: geneid)')
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
98 parser.add_argument('-x', '--txattr', default='transcript_id', required=False,
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
99 help='GFF attribute to be used as transcript ID. Used for DEXSeq output only.'
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
100 'This should match the first column of DESeq2 output(default: transcript_id)')
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
101 parser.add_argument('-a', '--attributes', default='gene_biotype, gene_name', required=False,
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
102 help='Comma separated attributes to include in output. Default: gene_biotype, gene_name')
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
103 parser.add_argument('-o', '--output', required=True, help='Output file')
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
104 args = parser.parse_args()
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
105
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
106 print("DE(X)Seq output file : %s" % args.input)
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
107 print("Input file type : %s" % args.mode)
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
108 print("Annotation file : %s" % args.gff)
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
109 print("Feature type : %s" % args.type)
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
110 print("ID attribute : %s" % args.idattr)
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
111 print("Transcript attribute : %s" % args.txattr)
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
112 print("Attributes to include : %s" % args.attributes)
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
113 print("Annotated output file : %s" % args.output)
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
114
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
115 attr = [x.strip() for x in args.attributes.split(',')]
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
116 annotation, bed_entries = gff_to_dict(args.gff, args.type, args.idattr, args.txattr, attr, args.mode)
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
117
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
118 d_binexon = {}
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
119 skip_exon_annotation = False
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
120
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
121 if args.mode == "dexseq":
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
122 with open(args.input) as fh_input, open("input.bed", "w") as fh_input_bed:
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
123 for line in fh_input:
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
124 f = line.split('\t')
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
125 fh_input_bed.write('\t'.join([f[11], f[12], f[13], f[0], "0", f[15]]) + "\n")
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
126
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
127 if len(bed_entries) == 0 and args.mode == "dexseq":
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
128 print("It seems there are no transcript ids present in GFF file. Skipping exon annotation.")
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
129 skip_exon_annotation = True
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
130
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
131 if not skip_exon_annotation:
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
132 with open("annotation.bed", "w") as fh_annotation_bed:
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
133 for line in bed_entries:
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
134 fh_annotation_bed.write(line + "\n")
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
135
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
136 # intersect the DEXSeq counting bins with exons in the GFF file
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
137 # overlapping positions can later be used to infer which bin corresponds to which exon
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
138 os.system("intersectBed -wo -s -a input.bed -b annotation.bed > overlap.txt")
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
139
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
140 with open("overlap.txt") as fh_overlap:
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
141 for line in fh_overlap:
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
142 binid = line.split('\t')[3]
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
143 exonid = line.split('\t')[9]
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
144 d_binexon.setdefault(binid, []).append(exonid)
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
145
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
146 with open(args.input) as fh_input, open(args.output, 'w') as fh_output:
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
147 for line in fh_input:
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
148 annot = []
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
149 # Append the extra information from GFF to DESeq2 output
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
150 if args.mode == "deseq2":
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
151 geneid = line.split('\t')[0]
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
152 annot = [str(annotation[geneid]['chr']),
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
153 str(annotation[geneid]['start']),
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
154 str(annotation[geneid]['end']),
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
155 str(annotation[geneid]['strand'])]
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
156 for a in attr:
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
157 annot.append(annotation[geneid][a])
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
158 # DEXSeq exonic bins might originate from aggregating multiple genes. Their gene ids are separated by '+'
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
159 # Append the attributes from the GFF, keeping the order of the aggregated genes and joining their values with '+'
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
160 # Append the transcript id and exon number from the annotation that correspond to the DEXSeq counting bins
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
161 elif args.mode == "dexseq":
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
162 geneids = line.split('\t')[1].split('+')
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
163 for a in attr:
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
164 tmp = []
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
165 for geneid in geneids:
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
166 tmp.append(str(annotation[geneid][a]))
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
167 annot.append('+'.join(tmp))
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
168 if not skip_exon_annotation:
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
169 binid = line.split('\t')[0]
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
170 try:
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
171 annot.append(','.join(sorted(set(d_binexon[binid]))))
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
172 except KeyError:
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
173 annot.append('NA')
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
174 fh_output.write(line.rstrip('\n') + '\t' + '\t'.join(annot) + '\n')
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
175
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
176
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
177 if __name__ == "__main__":
b42373cddb77 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
iuc
parents:
diff changeset
178 main()