Mercurial > repos > artbio > mircounts
comparison mature_mir_gff_translation.py @ 0:da29af78a960 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit d4d8106d66b65679a1a685ab94bfcf99cdb7b959
author | artbio |
---|---|
date | Mon, 24 Jul 2017 06:27:50 -0400 |
parents | |
children | 6b8adacd4750 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:da29af78a960 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import sys | |
4 import argparse | |
5 | |
6 | |
def Parser():
    '''
    Parse the command line of the script and return the argparse namespace.

    Both options are mandatory: the rest of the script opens args.input for
    reading and args.output for writing, so a missing option is reported
    here as a usage error instead of failing later inside open().

    Returns a namespace with two string attributes:
      input  -- path of the miRBase GFF3 file to read
      output -- path of the GFF3 file to write
    '''
    the_parser = argparse.ArgumentParser(
        description="Translate mature miRNA coordinates of a miRBase GFF3 "
                    "file to coordinates relative to their hairpin")
    the_parser.add_argument(
        '--input', action="store", type=str, required=True,
        help="input miRBase GFF3 file")
    the_parser.add_argument(
        '--output', action="store", type=str, required=True,
        help="output GFF3 file with converted mature mir coordinates")
    return the_parser.parse_args()
15 | |
# Comment header written verbatim at the top of the generated GFF3 file.
# One literal per output line; every line, including the last, ends with "\n".
GFF3_header = "".join([
    '##gff-version 3\n',
    '##generated by mature_mir_gff_translation.py\n',
    '#\n',
    '# Chromosomal coordinates of microRNAs ** relative to the hairpin precursors **\n',
    '# microRNAs: miRBase current_version\n',
    '# genome-build-id: check http://mirbase.org/\n',
    '#\n',
    '# Hairpin precursor sequences have type "miRNA_primary_transcript".\n',
    '# Note, these sequences do not represent the full primary transcript,\n',
    '# rather a predicted stem-loop portion that includes the precursor\n',
    '# miRNA. Mature sequences have type "miRNA".\n',
    '#\n',
])
29 | |
def load_gff_in_dict(gff_input_file):
    '''
    Reads the gff3 file and returns a dictionary of dictionaries.

    The key of the primary dictionary is the ID found in the attributes
    column. Each value is a dictionary holding the 9 standard gff3 fields,
    all kept as strings, under the keys "seqid", "source", "type", "start",
    "end", "score", "strand", "phase" and "attributes".

    For a feature carrying a Derives_from attribute, the Name of the feature
    it derives from is appended to its attributes as ";Parent_mir_Name=<name>".

    Comment lines (starting with "#") and blank lines are skipped.

    Raises KeyError if a feature derives from an ID that has not been read
    yet (the parent must come first in the file), and IndexError if a data
    line has fewer than 9 columns or lacks an "ID=" / "Name=" attribute.
    '''
    field_names = ("seqid", "source", "type", "start", "end",
                   "score", "strand", "phase", "attributes")
    gff_dict = {}
    # the context manager closes the file even if a malformed line raises
    with open(gff_input_file, "r") as gff_handle:
        for line in gff_handle:
            # strip only the line terminator, so that a last line without
            # trailing newline does not lose its final character
            line = line.rstrip("\r\n")
            if not line or line.startswith("#"):
                continue
            gff_fields = line.split("\t")
            ID = gff_fields[8].split("ID=")[1].split(";")[0]
            record = dict(zip(field_names, gff_fields[:9]))
            gff_dict[ID] = record
            if "Derives_from=" in record["attributes"]:
                # cut at the next ";" so that the lookup also works when
                # Derives_from / Name are not the last attribute
                parent_ID = record["attributes"].split(
                    "Derives_from=")[1].split(";")[0]
                parent_name = gff_dict[parent_ID]["attributes"].split(
                    "Name=")[1].split(";")[0]
                record["attributes"] = "%s;Parent_mir_Name=%s" % (
                    record["attributes"], parent_name)
    return gff_dict
57 | |
58 | |
def _format_gff_record(record):
    '''Returns one tab-separated, newline-terminated gff3 line for a record.'''
    return "\t".join([record["seqid"], record["source"], record["type"],
                      record["start"], record["end"], record["score"],
                      record["strand"], record["phase"],
                      record["attributes"]]) + "\n"


def genome_to_mir_gff(gff_dict, output):
    '''
    Converts seqid field from chromosome to item Name
    Then converts coordinates relative to "miRNA_primary_transcript"
    Note that GFF files are 1-based coordinates

    gff_dict -- dictionary of gff3 records keyed by ID, as returned by
                load_gff_in_dict; its records are modified in place
    output   -- path of the gff3 file to write

    The file starts with GFF3_header, then lists each
    miRNA_primary_transcript (sorted by Name) immediately followed by the
    features whose Derives_from attribute is exactly its ID (sorted by Name,
    in reverse order). Features of another type without a Derives_from
    attribute are not written.

    Raises KeyError if a Derives_from attribute points to an ID missing
    from gff_dict.
    '''
    for key in gff_dict:
        record = gff_dict[key]
        record["seqid"] = record["attributes"].split("Name=")[1].split(";")[0]
        if "Derives_from=" in record["attributes"]:
            parent_ID = record["attributes"].split(
                "Derives_from=")[1].split(";")[0]
            # NOTE(review): the offset is taken from the parent start
            # whatever the strand; for "-" strand features this may need
            # to be counted from the parent end instead -- TODO confirm
            parent_start = int(gff_dict[parent_ID]["start"])
            record["start"] = str(int(record["start"]) - parent_start + 1)
            record["end"] = str(int(record["end"]) - parent_start + 1)
    hairpins = {}
    matures = {}
    children = {}  # hairpin ID -> IDs of the features deriving from it
    # treats miRNA_primary_transcript coordinates in a second loop, because
    # the loop above still needs their genomic start
    for key in gff_dict:
        record = gff_dict[key]
        name = record["attributes"].split("Name=")[1].split(";")[0]
        if record["type"] == "miRNA_primary_transcript":
            record["end"] = str(int(record["end"]) - int(record["start"]) + 1)
            record["start"] = "1"
            hairpins[key] = name
        else:
            matures[key] = name
            if "Derives_from=" in record["attributes"]:
                parent_ID = record["attributes"].split(
                    "Derives_from=")[1].split(";")[0]
                children.setdefault(parent_ID, []).append(key)
    with open(output, "w") as out_handle:
        out_handle.write(GFF3_header)
        for hairpin_ID in sorted(hairpins, key=hairpins.get):
            out_handle.write(_format_gff_record(gff_dict[hairpin_ID]))
            # exact match on the parent ID: a hairpin ID that is a prefix of
            # another one no longer attracts the matures of the other
            derived = sorted(children.get(hairpin_ID, []),
                             key=matures.get, reverse=True)
            for mature_ID in derived:
                out_handle.write(_format_gff_record(gff_dict[mature_ID]))
97 | |
98 | |
def main(infile, outfile):
    '''
    Loads the gff3 file infile with load_gff_in_dict and hands the resulting
    dictionary to genome_to_mir_gff, which writes the file outfile.
    '''
    genome_to_mir_gff(load_gff_in_dict(infile), outfile)
102 | |
103 | |
# Script entry point: parse the command line, then run the conversion.
if __name__ == "__main__":
    args = Parser()
    main(infile=args.input, outfile=args.output)
106 main(args.input, args.output) |