comparison mature_mir_gff_translation.py @ 3:6b8adacd4750 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit fa65a844f9041a83767f5305ab360abfdf68f59f
author artbio
date Wed, 26 Jul 2017 19:15:08 -0400
parents da29af78a960
children 9ea96a02c416
comparison
equal deleted inserted replaced
2:f59c643b00fc 3:6b8adacd4750
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 import sys
4 import argparse 3 import argparse
5 4
6 5
def Parser(argv=None):
    '''
    Builds the command line parser and returns the parsed arguments

    argv: optional list of argument strings to parse; when None
          (the default) argparse falls back to sys.argv[1:]
    Returns an argparse.Namespace with "input" and "output" attributes,
    each of them None when the corresponding option is not given
    '''
    the_parser = argparse.ArgumentParser()
    the_parser.add_argument(
        '--input', action="store", type=str, help="input miRBase GFF3 file")
    the_parser.add_argument(
        '--output', action="store", type=str,
        help="output GFF3 file with converted mature mir coordinates")
    return the_parser.parse_args(argv)
15 15
16 GFF3_header= '''##gff-version 3 16
17 GFF3_header = '''##gff-version 3
17 ##generated by mature_mir_gff_translation.py 18 ##generated by mature_mir_gff_translation.py
18 # 19 #
19 # Chromosomal coordinates of microRNAs ** relative to the hairpin precursors ** 20 # Chromosomal coordinates of microRNAs ** relative to the hairpin precursors **
20 # microRNAs: miRBase current_version 21 # microRNAs: miRBase current_version
21 # genome-build-id: check http://mirbase.org/ 22 # genome-build-id: check http://mirbase.org/
22 # 23 #
23 # Hairpin precursor sequences have type "miRNA_primary_transcript". 24 # Hairpin precursor sequences have type "miRNA_primary_transcript".
24 # Note, these sequences do not represent the full primary transcript, 25 # Note, these sequences do not represent the full primary transcript,
25 # rather a predicted stem-loop portion that includes the precursor 26 # rather a predicted stem-loop portion that includes the precursor
26 # miRNA. Mature sequences have type "miRNA". 27 # miRNA. Mature sequences have type "miRNA".
27 # 28 #
28 ''' 29 '''
30
29 31
def load_gff_in_dict(gff_input_file):
    '''
    Reads the gff3 file and return a dictionary of dictionaries
    with keys equal to standard gff3 fields (9)
    Note that the key of the primary dictionary is the ID

    Comment lines (starting with "#") and blank lines are skipped.
    Every record whose attributes contain "Derives_from=" gets
    ";Parent_mir_Name=<Name of the parent record>" appended to its
    attributes.
    Raises IndexError on a record with fewer than 9 columns or without
    "ID=", and KeyError when a "Derives_from" ID is not in the file.
    '''
    field_names = ("seqid", "source", "type", "start", "end",
                   "score", "strand", "phase", "attributes")
    gff_dict = {}
    # the with statement closes the input handle, even on a parsing error
    with open(gff_input_file, "r") as gff_input:
        for line in gff_input:
            # rstrip instead of line[:-1]: a last line without a trailing
            # newline must not lose its final character
            line = line.rstrip("\r\n")
            if not line.strip() or line[0] == "#":
                continue
            gff_fields = line.split("\t")
            ID = gff_fields[8].split("ID=")[1].split(";")[0]
            gff_dict[ID] = dict(zip(field_names, gff_fields))
    # parent names are resolved once all records are loaded, so that a
    # record may be listed before the record it derives from
    for ID in gff_dict:
        attributes = gff_dict[ID]["attributes"]
        if "Derives_from=" not in attributes:
            continue
        # values are cut at the next ";" because neither Derives_from
        # nor Name is guaranteed to be the last attribute
        parent_ID = attributes.split("Derives_from=")[1].split(";")[0]
        parent_name = gff_dict[parent_ID]["attributes"].split(
            "Name=")[1].split(";")[0]
        gff_dict[ID]["attributes"] = "%s;Parent_mir_Name=%s" % (
            attributes, parent_name)
    return gff_dict
57 62
58 63
def genome_to_mir_gff(gff_dict, output):
    '''
    Converts seqid field from chromosome to item Name
    Then converts coordinates relative to "miRNA_primary_transcript"
    Note that GFF files are 1-based coordinates

    gff_dict: dictionary of records keyed by ID, each record holding the
              9 gff3 fields as strings; it is modified in place
              (seqid, start and end are rewritten)
    output: path of the GFF3 file to write
    The file starts with GFF3_header, then lists each
    "miRNA_primary_transcript" record (sorted by Name) followed by the
    records that derive from it (sorted by Name, in reverse order).
    Raises KeyError when a "Derives_from" ID is not a key of gff_dict.
    '''
    columns = ("seqid", "source", "type", "start", "end",
               "score", "strand", "phase", "attributes")

    def attribute(record, tag):
        # value following "tag=" in the attributes field, up to the next ";"
        return record["attributes"].split(tag + "=")[1].split(";")[0]

    def gff_line(record):
        return "\t".join([record[column] for column in columns]) + "\n"

    for key in gff_dict:
        record = gff_dict[key]
        record["seqid"] = attribute(record, "Name")
        if "Derives_from=" in record["attributes"]:
            parent_start = int(
                gff_dict[attribute(record, "Derives_from")]["start"])
            # NOTE(review): the offset is taken from the parent start
            # whatever the strand field says - confirm this is what is
            # expected for records on the "-" strand
            record["start"] = str(int(record["start"]) - parent_start + 1)
            record["end"] = str(int(record["end"]) - parent_start + 1)
    hairpins = {}
    matures = {}
    matures_by_parent = {}
    # treats miRNA_primary_transcript coordinates
    # in a second loop to avoid errors in conversion
    for key in gff_dict:
        record = gff_dict[key]
        if record["type"] == "miRNA_primary_transcript":
            record["end"] = str(int(record["end"]) - int(record["start"]) + 1)
            record["start"] = '1'
            # now, do a dict[ID]=Name but only for miRNA_primary_transcript
            hairpins[key] = attribute(record, "Name")
        else:
            matures[key] = attribute(record, "Name")
            # group by the exact parent ID: a substring test on the
            # attributes would also match any ID the parent ID prefixes
            if "Derives_from=" in record["attributes"]:
                matures_by_parent.setdefault(
                    attribute(record, "Derives_from"), []).append(key)
    with open(output, "w") as gff_out:
        gff_out.write(GFF3_header)
        for hairpin_ID in sorted(hairpins, key=hairpins.get):
            gff_out.write(gff_line(gff_dict[hairpin_ID]))
            derived = matures_by_parent.get(hairpin_ID, [])
            for mature_ID in sorted(derived, key=matures.get, reverse=True):
                gff_out.write(gff_line(gff_dict[mature_ID]))
97 116
98 117
99 def main(infile, outfile): 118 def main(infile, outfile):
100 gff_dict = load_gff_in_dict(infile) 119 gff_dict = load_gff_in_dict(infile)