Mercurial > repos > iuc > deg_annotate
comparison deg_annotate.py @ 0:b42373cddb77 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
| author | iuc |
|---|---|
| date | Fri, 23 Nov 2018 01:59:47 -0500 |
| parents | |
| children | e98d4ab5b5bc |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:b42373cddb77 |
|---|---|
| 1 import argparse | |
| 2 import os | |
| 3 from collections import defaultdict | |
| 4 | |
| 5 from BCBio import GFF | |
| 6 | |
| 7 | |
| 8 def strandardize(strand): | |
| 9 if str(strand) == '-1': | |
| 10 strand = '-' | |
| 11 elif str(strand) == '1': | |
| 12 strand = '+' | |
| 13 return strand | |
| 14 | |
| 15 | |
| 16 def gff_to_dict(f_gff, feat_type, idattr, txattr, attributes, input_type): | |
| 17 """ | |
| 18 It reads only exonic features because not all GFF files contain gene and trascript features. From the exonic | |
| 19 features it extracts gene names, biotypes, start and end positions. If any of these attributes do not exit | |
| 20 then they are set to NA. | |
| 21 """ | |
| 22 annotation = defaultdict(lambda: defaultdict(lambda: 'NA')) | |
| 23 exon_pos = defaultdict(lambda: defaultdict(lambda: defaultdict(int))) | |
| 24 tx_info = defaultdict(lambda: defaultdict(str)) | |
| 25 | |
| 26 with open(f_gff) as gff_handle: | |
| 27 for rec in GFF.parse(gff_handle, limit_info=dict(gff_type=[feat_type]), target_lines=1): | |
| 28 for sub_feature in rec.features: | |
| 29 start = sub_feature.location.start | |
| 30 end = sub_feature.location.end | |
| 31 strand = strandardize(sub_feature.location.strand) | |
| 32 try: | |
| 33 geneid = sub_feature.qualifiers[idattr][0] | |
| 34 except KeyError: | |
| 35 print("No '" + idattr + "' attribute found for the feature at position " | |
| 36 + rec.id + ":" + str(start) + ":" + str(end) + ". Please check your GTF/GFF file.") | |
| 37 continue | |
| 38 | |
| 39 annotation[geneid]['chr'] = rec.id | |
| 40 annotation[geneid]['strand'] = strand | |
| 41 if annotation[geneid]['start'] == 'NA' or start <= int(annotation[geneid]['start']): | |
| 42 annotation[geneid]['start'] = start | |
| 43 if annotation[geneid]['end'] == 'NA' or end >= int(annotation[geneid]['end']): | |
| 44 annotation[geneid]['end'] = end | |
| 45 | |
| 46 for attr in attributes: | |
| 47 if attr in annotation[geneid]: | |
| 48 continue | |
| 49 try: | |
| 50 annotation[geneid][attr] = sub_feature.qualifiers[attr][0] | |
| 51 except KeyError: | |
| 52 annotation[geneid][attr] = 'NA' | |
| 53 # extract exon information only in case of dexseq output | |
| 54 if input_type != "dexseq": | |
| 55 continue | |
| 56 try: | |
| 57 txid = sub_feature.qualifiers[txattr][0] | |
| 58 tx_info[txid]['chr'] = rec.id | |
| 59 tx_info[txid]['strand'] = strand | |
| 60 exon_pos[txid][int(start)][int(end)] = 1 | |
| 61 except KeyError: | |
| 62 print("No '" + txattr + "' attribute found for the feature at position " + rec.id + ":" + str( | |
| 63 start) + ":" + str(end) + ". Please check your GTF/GFF file.") | |
| 64 pass | |
| 65 | |
| 66 bed_entries = [] | |
| 67 # create BED lines only for deseq output | |
| 68 if input_type == "dexseq": | |
| 69 for txid in exon_pos.keys(): | |
| 70 starts = sorted(exon_pos[txid]) | |
| 71 strand = tx_info[txid]['strand'] | |
| 72 if strand == '-': | |
| 73 starts = reversed(starts) | |
| 74 for c, start in enumerate(starts, 1): | |
| 75 ends = sorted(exon_pos[txid][start]) | |
| 76 if strand == '-': | |
| 77 ends = reversed(ends) | |
| 78 for end in ends: | |
| 79 bed_entries.append('\t'.join([tx_info[txid]['chr'], str(start), str(end), | |
| 80 txid + ':' + str(c), '0', strand])) | |
| 81 | |
| 82 return annotation, bed_entries | |
| 83 | |
| 84 | |
| 85 def main(): | |
| 86 parser = argparse.ArgumentParser(description='Annotate DESeq2/DEXSeq tables with information from GFF/GTF files') | |
| 87 parser.add_argument('-in', '--input', required=True, | |
| 88 help='DESeq2/DEXSeq output. It is allowed to have extra information, ' | |
| 89 'but make sure that the original output columns are not altered') | |
| 90 parser.add_argument('-m', '--mode', required=True, choices=["deseq2", "dexseq"], default='deseq2', | |
| 91 help='Input file type') | |
| 92 parser.add_argument('-g', '--gff', required=True, help='The same annotation GFF/GTF file used for couting') | |
| 93 parser.add_argument('-t', '--type', default='exon', required=False, | |
| 94 help='feature type (3rd column in GFF file) to be used (default: exon)') | |
| 95 parser.add_argument('-i', '--idattr', default='gene_id', required=False, | |
| 96 help='GFF attribute to be used as feature ID. ' | |
| 97 'This should match the first column of DESeq2 output(default: geneid)') | |
| 98 parser.add_argument('-x', '--txattr', default='transcript_id', required=False, | |
| 99 help='GFF attribute to be used as transcript ID. Used for DEXSeq output only.' | |
| 100 'This should match the first column of DESeq2 output(default: transcript_id)') | |
| 101 parser.add_argument('-a', '--attributes', default='gene_biotype, gene_name', required=False, | |
| 102 help='Comma separated attributes to include in output. Default: gene_biotype, gene_name') | |
| 103 parser.add_argument('-o', '--output', required=True, help='Output file') | |
| 104 args = parser.parse_args() | |
| 105 | |
| 106 print("DE(X)Seq output file : %s" % args.input) | |
| 107 print("Input file type : %s" % args.mode) | |
| 108 print("Annotation file : %s" % args.gff) | |
| 109 print("Feature type : %s" % args.type) | |
| 110 print("ID attribute : %s" % args.idattr) | |
| 111 print("Transcript attribute : %s" % args.txattr) | |
| 112 print("Attributes to include : %s" % args.attributes) | |
| 113 print("Annotated output file : %s" % args.output) | |
| 114 | |
| 115 attr = [x.strip() for x in args.attributes.split(',')] | |
| 116 annotation, bed_entries = gff_to_dict(args.gff, args.type, args.idattr, args.txattr, attr, args.mode) | |
| 117 | |
| 118 d_binexon = {} | |
| 119 skip_exon_annotation = False | |
| 120 | |
| 121 if args.mode == "dexseq": | |
| 122 with open(args.input) as fh_input, open("input.bed", "w") as fh_input_bed: | |
| 123 for line in fh_input: | |
| 124 f = line.split('\t') | |
| 125 fh_input_bed.write('\t'.join([f[11], f[12], f[13], f[0], "0", f[15]]) + "\n") | |
| 126 | |
| 127 if len(bed_entries) == 0 and args.mode == "dexseq": | |
| 128 print("It seems there are no transcript ids present in GFF file. Skipping exon annotation.") | |
| 129 skip_exon_annotation = True | |
| 130 | |
| 131 if not skip_exon_annotation: | |
| 132 with open("annotation.bed", "w") as fh_annotation_bed: | |
| 133 for line in bed_entries: | |
| 134 fh_annotation_bed.write(line + "\n") | |
| 135 | |
| 136 # interset the DEXseq couting bins with exons in the GFF file | |
| 137 # overlaped positions can be later used to infer which bin corresponds to which exon | |
| 138 os.system("intersectBed -wo -s -a input.bed -b annotation.bed > overlap.txt") | |
| 139 | |
| 140 with open("overlap.txt") as fh_overlap: | |
| 141 for line in fh_overlap: | |
| 142 binid = line.split('\t')[3] | |
| 143 exonid = line.split('\t')[9] | |
| 144 d_binexon.setdefault(binid, []).append(exonid) | |
| 145 | |
| 146 with open(args.input) as fh_input, open(args.output, 'w') as fh_output: | |
| 147 for line in fh_input: | |
| 148 annot = [] | |
| 149 # Append the extra information from GFF to DESeq2 output | |
| 150 if args.mode == "deseq2": | |
| 151 geneid = line.split('\t')[0] | |
| 152 annot = [str(annotation[geneid]['chr']), | |
| 153 str(annotation[geneid]['start']), | |
| 154 str(annotation[geneid]['end']), | |
| 155 str(annotation[geneid]['strand'])] | |
| 156 for a in attr: | |
| 157 annot.append(annotation[geneid][a]) | |
| 158 # DEXSeq exonic bins might originate from aggrigating multiple genes. They are are separated by '+' | |
| 159 # Append the attributes from the GFF but keep the order of the aggregated genes and use '+' | |
| 160 # Aappend the transcript id and exon number from the annotation that correspond to the DEXseq counting bins | |
| 161 elif args.mode == "dexseq": | |
| 162 geneids = line.split('\t')[1].split('+') | |
| 163 for a in attr: | |
| 164 tmp = [] | |
| 165 for geneid in geneids: | |
| 166 tmp.append(str(annotation[geneid][a])) | |
| 167 annot.append('+'.join(tmp)) | |
| 168 if not skip_exon_annotation: | |
| 169 binid = line.split('\t')[0] | |
| 170 try: | |
| 171 annot.append(','.join(sorted(set(d_binexon[binid])))) | |
| 172 except KeyError: | |
| 173 annot.append('NA') | |
| 174 fh_output.write(line.rstrip('\n') + '\t' + '\t'.join(annot) + '\n') | |
| 175 | |
| 176 | |
| 177 if __name__ == "__main__": | |
| 178 main() |
