Mercurial > repos > iuc > deg_annotate
comparison deg_annotate.py @ 0:b42373cddb77 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
author | iuc |
---|---|
date | Fri, 23 Nov 2018 01:59:47 -0500 |
parents | |
children | e98d4ab5b5bc |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b42373cddb77 |
---|---|
1 import argparse | |
2 import os | |
3 from collections import defaultdict | |
4 | |
5 from BCBio import GFF | |
6 | |
7 | |
8 def strandardize(strand): | |
9 if str(strand) == '-1': | |
10 strand = '-' | |
11 elif str(strand) == '1': | |
12 strand = '+' | |
13 return strand | |
14 | |
15 | |
16 def gff_to_dict(f_gff, feat_type, idattr, txattr, attributes, input_type): | |
17 """ | |
18 It reads only exonic features because not all GFF files contain gene and trascript features. From the exonic | |
19 features it extracts gene names, biotypes, start and end positions. If any of these attributes do not exit | |
20 then they are set to NA. | |
21 """ | |
22 annotation = defaultdict(lambda: defaultdict(lambda: 'NA')) | |
23 exon_pos = defaultdict(lambda: defaultdict(lambda: defaultdict(int))) | |
24 tx_info = defaultdict(lambda: defaultdict(str)) | |
25 | |
26 with open(f_gff) as gff_handle: | |
27 for rec in GFF.parse(gff_handle, limit_info=dict(gff_type=[feat_type]), target_lines=1): | |
28 for sub_feature in rec.features: | |
29 start = sub_feature.location.start | |
30 end = sub_feature.location.end | |
31 strand = strandardize(sub_feature.location.strand) | |
32 try: | |
33 geneid = sub_feature.qualifiers[idattr][0] | |
34 except KeyError: | |
35 print("No '" + idattr + "' attribute found for the feature at position " | |
36 + rec.id + ":" + str(start) + ":" + str(end) + ". Please check your GTF/GFF file.") | |
37 continue | |
38 | |
39 annotation[geneid]['chr'] = rec.id | |
40 annotation[geneid]['strand'] = strand | |
41 if annotation[geneid]['start'] == 'NA' or start <= int(annotation[geneid]['start']): | |
42 annotation[geneid]['start'] = start | |
43 if annotation[geneid]['end'] == 'NA' or end >= int(annotation[geneid]['end']): | |
44 annotation[geneid]['end'] = end | |
45 | |
46 for attr in attributes: | |
47 if attr in annotation[geneid]: | |
48 continue | |
49 try: | |
50 annotation[geneid][attr] = sub_feature.qualifiers[attr][0] | |
51 except KeyError: | |
52 annotation[geneid][attr] = 'NA' | |
53 # extract exon information only in case of dexseq output | |
54 if input_type != "dexseq": | |
55 continue | |
56 try: | |
57 txid = sub_feature.qualifiers[txattr][0] | |
58 tx_info[txid]['chr'] = rec.id | |
59 tx_info[txid]['strand'] = strand | |
60 exon_pos[txid][int(start)][int(end)] = 1 | |
61 except KeyError: | |
62 print("No '" + txattr + "' attribute found for the feature at position " + rec.id + ":" + str( | |
63 start) + ":" + str(end) + ". Please check your GTF/GFF file.") | |
64 pass | |
65 | |
66 bed_entries = [] | |
67 # create BED lines only for deseq output | |
68 if input_type == "dexseq": | |
69 for txid in exon_pos.keys(): | |
70 starts = sorted(exon_pos[txid]) | |
71 strand = tx_info[txid]['strand'] | |
72 if strand == '-': | |
73 starts = reversed(starts) | |
74 for c, start in enumerate(starts, 1): | |
75 ends = sorted(exon_pos[txid][start]) | |
76 if strand == '-': | |
77 ends = reversed(ends) | |
78 for end in ends: | |
79 bed_entries.append('\t'.join([tx_info[txid]['chr'], str(start), str(end), | |
80 txid + ':' + str(c), '0', strand])) | |
81 | |
82 return annotation, bed_entries | |
83 | |
84 | |
85 def main(): | |
86 parser = argparse.ArgumentParser(description='Annotate DESeq2/DEXSeq tables with information from GFF/GTF files') | |
87 parser.add_argument('-in', '--input', required=True, | |
88 help='DESeq2/DEXSeq output. It is allowed to have extra information, ' | |
89 'but make sure that the original output columns are not altered') | |
90 parser.add_argument('-m', '--mode', required=True, choices=["deseq2", "dexseq"], default='deseq2', | |
91 help='Input file type') | |
92 parser.add_argument('-g', '--gff', required=True, help='The same annotation GFF/GTF file used for couting') | |
93 parser.add_argument('-t', '--type', default='exon', required=False, | |
94 help='feature type (3rd column in GFF file) to be used (default: exon)') | |
95 parser.add_argument('-i', '--idattr', default='gene_id', required=False, | |
96 help='GFF attribute to be used as feature ID. ' | |
97 'This should match the first column of DESeq2 output(default: geneid)') | |
98 parser.add_argument('-x', '--txattr', default='transcript_id', required=False, | |
99 help='GFF attribute to be used as transcript ID. Used for DEXSeq output only.' | |
100 'This should match the first column of DESeq2 output(default: transcript_id)') | |
101 parser.add_argument('-a', '--attributes', default='gene_biotype, gene_name', required=False, | |
102 help='Comma separated attributes to include in output. Default: gene_biotype, gene_name') | |
103 parser.add_argument('-o', '--output', required=True, help='Output file') | |
104 args = parser.parse_args() | |
105 | |
106 print("DE(X)Seq output file : %s" % args.input) | |
107 print("Input file type : %s" % args.mode) | |
108 print("Annotation file : %s" % args.gff) | |
109 print("Feature type : %s" % args.type) | |
110 print("ID attribute : %s" % args.idattr) | |
111 print("Transcript attribute : %s" % args.txattr) | |
112 print("Attributes to include : %s" % args.attributes) | |
113 print("Annotated output file : %s" % args.output) | |
114 | |
115 attr = [x.strip() for x in args.attributes.split(',')] | |
116 annotation, bed_entries = gff_to_dict(args.gff, args.type, args.idattr, args.txattr, attr, args.mode) | |
117 | |
118 d_binexon = {} | |
119 skip_exon_annotation = False | |
120 | |
121 if args.mode == "dexseq": | |
122 with open(args.input) as fh_input, open("input.bed", "w") as fh_input_bed: | |
123 for line in fh_input: | |
124 f = line.split('\t') | |
125 fh_input_bed.write('\t'.join([f[11], f[12], f[13], f[0], "0", f[15]]) + "\n") | |
126 | |
127 if len(bed_entries) == 0 and args.mode == "dexseq": | |
128 print("It seems there are no transcript ids present in GFF file. Skipping exon annotation.") | |
129 skip_exon_annotation = True | |
130 | |
131 if not skip_exon_annotation: | |
132 with open("annotation.bed", "w") as fh_annotation_bed: | |
133 for line in bed_entries: | |
134 fh_annotation_bed.write(line + "\n") | |
135 | |
136 # interset the DEXseq couting bins with exons in the GFF file | |
137 # overlaped positions can be later used to infer which bin corresponds to which exon | |
138 os.system("intersectBed -wo -s -a input.bed -b annotation.bed > overlap.txt") | |
139 | |
140 with open("overlap.txt") as fh_overlap: | |
141 for line in fh_overlap: | |
142 binid = line.split('\t')[3] | |
143 exonid = line.split('\t')[9] | |
144 d_binexon.setdefault(binid, []).append(exonid) | |
145 | |
146 with open(args.input) as fh_input, open(args.output, 'w') as fh_output: | |
147 for line in fh_input: | |
148 annot = [] | |
149 # Append the extra information from GFF to DESeq2 output | |
150 if args.mode == "deseq2": | |
151 geneid = line.split('\t')[0] | |
152 annot = [str(annotation[geneid]['chr']), | |
153 str(annotation[geneid]['start']), | |
154 str(annotation[geneid]['end']), | |
155 str(annotation[geneid]['strand'])] | |
156 for a in attr: | |
157 annot.append(annotation[geneid][a]) | |
158 # DEXSeq exonic bins might originate from aggrigating multiple genes. They are are separated by '+' | |
159 # Append the attributes from the GFF but keep the order of the aggregated genes and use '+' | |
160 # Aappend the transcript id and exon number from the annotation that correspond to the DEXseq counting bins | |
161 elif args.mode == "dexseq": | |
162 geneids = line.split('\t')[1].split('+') | |
163 for a in attr: | |
164 tmp = [] | |
165 for geneid in geneids: | |
166 tmp.append(str(annotation[geneid][a])) | |
167 annot.append('+'.join(tmp)) | |
168 if not skip_exon_annotation: | |
169 binid = line.split('\t')[0] | |
170 try: | |
171 annot.append(','.join(sorted(set(d_binexon[binid])))) | |
172 except KeyError: | |
173 annot.append('NA') | |
174 fh_output.write(line.rstrip('\n') + '\t' + '\t'.join(annot) + '\n') | |
175 | |
176 | |
177 if __name__ == "__main__": | |
178 main() |