comparison mass.py @ 12:d080005cffe1 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/cufflinks/cufflinks commit a0b0845a9d1b3e7ecdeacd1e606133617e3918bd"
author iuc
date Tue, 16 Jun 2020 13:00:32 -0400
parents
children
comparison
equal deleted inserted replaced
11:e04dbae2abe0 12:d080005cffe1
1 import shutil
2 import sys
3 import tempfile
4
5
def parse_gff_attributes(attr_str):
    """
    Parses a GFF/GTF attribute string and returns a dictionary of name-value
    pairs. The general format for a GFF3 attributes string is

        name1=value1;name2=value2

    The general format for a GTF attribute string is

        name1 "value1" ; name2 "value2"

    The general format for a GFF attribute string is a single string that
    denotes the interval's group; in this case, method returns a dictionary
    with a single key-value pair, and key name is 'group'

    Each ';'-separated piece is split on whichever delimiter ('=' or '"')
    occurs first, so a quoted GTF value may contain '=' and a GFF3 value may
    contain '=' or '"' without being truncated. Pieces with no delimiter or
    with an empty name are skipped.

    :param attr_str: raw text of the attributes column
    :returns: dict mapping attribute names to values, with surrounding
              spaces and double quotes stripped from the values
    """
    attributes = {}
    for name_value_pair in attr_str.split(";"):
        pair_str = name_value_pair.strip()
        eq_pos = pair_str.find("=")
        quote_pos = pair_str.find("\"")
        if eq_pos >= 0 and (quote_pos < 0 or eq_pos < quote_pos):
            # GFF3 style. Split on the first '=' only so that values which
            # themselves contain '=' are kept whole.
            name, _, value = pair_str.partition("=")
        elif quote_pos >= 0:
            # GTF style: the value is the text between the first pair of
            # double quotes.
            pieces = pair_str.split("\"")
            name, value = pieces[0], pieces[1]
        else:
            # No delimiter at all (e.g. the empty piece after a trailing
            # ';'); nothing to record.
            continue
        name = name.strip()
        if name == '':
            continue
        # Need to strip double quote from values
        attributes[name] = value.strip(" \"")

    if len(attributes) == 0:
        # Could not split attributes string, so entire string must be
        # 'group' attribute. This is the case for strictly GFF files.
        attributes['group'] = attr_str
    return attributes
46
47
def gff_attributes_to_str(attrs, gff_format):
    """
    Convert GFF attributes to string. Supported formats are GFF3, GTF.

    For GTF output, the first of the 'group', 'ID' or 'Parent' attributes
    that is present is also emitted as both 'transcript_id' and 'gene_id'.
    The dictionary passed in is not modified.

    :param attrs: dict mapping attribute names to values
    :param gff_format: 'GTF' or 'GFF3'
    :returns: the attributes rendered in the requested format and joined
              with ' ; '
    :raises ValueError: if gff_format is not a supported format
    """
    # Work on a copy so the caller's dictionary is left untouched.
    attrs = dict(attrs)
    if gff_format == 'GTF':
        format_string = '%s "%s"'
        # Convert group (GFF) and ID, parent (GFF3) attributes to transcript_id, gene_id
        id_attr = None
        for candidate in ('group', 'ID', 'Parent'):
            if candidate in attrs:
                id_attr = candidate
                break
        if id_attr:
            attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr]
    elif gff_format == 'GFF3':
        format_string = '%s=%s'
    else:
        # Without this branch format_string would be unbound and the
        # function would fail later with an obscure UnboundLocalError.
        raise ValueError("Unsupported GFF format: %r" % (gff_format,))
    return " ; ".join(format_string % (name, value) for name, value in attrs.items())
70
71
# Command-line arguments:
#   1: file holding the captured standard error of the original run
#   2: global model file name, or the literal string "None" when there is none
#   3: transcripts GTF file; rewritten in place when a global model is given
stderr = sys.argv[1]
global_model_file_name = sys.argv[2]
transcripts = sys.argv[3]

# Read standard error to get total map/upper quartile mass. -1 is the
# sentinel meaning that no such line was found.
total_map_mass = -1
with open(stderr, 'r') as tmp_stderr2:
    for line in tmp_stderr2:
        lowered = line.lower()
        if "map mass" in lowered or "upper quartile" in lowered:
            total_map_mass = float(line.split(":")[1].strip())
            break

if global_model_file_name != "None":
    if total_map_mass <= -1:
        # Scaling by the -1 sentinel would silently write negative FPKM
        # values, so stop with an explicit error instead.
        sys.exit("Could not find total map mass/upper quartile in %s; cannot rescale FPKM values." % stderr)

    # Global model is simply total map mass from original run.
    with open(global_model_file_name, 'r') as global_model_file:
        global_model_total_map_mass = float(global_model_file.readline())

    # Ratio of global model's total map mass to original run's map mass is
    # factor used to adjust FPKM.
    fpkm_map_mass_ratio = total_map_mass / global_model_total_map_mass

    # Update FPKM values in transcripts.gtf file.
    with open(transcripts, 'r') as transcripts_file:
        # Text mode is required: the default mode ("w+b") rejects str on
        # Python 3. The file is created in the working directory and kept
        # (delete=False) so it can replace the original afterwards.
        with tempfile.NamedTemporaryFile(mode="w", dir=".", delete=False) as new_transcripts_file:
            for line in transcripts_file:
                # Drop the line terminator so it cannot end up inside the
                # last field; it is added back when the line is written.
                fields = line.rstrip("\r\n").split('\t')
                if len(fields) < 9:
                    # Blank/comment/short lines have no attributes column;
                    # pass them through unchanged.
                    new_transcripts_file.write(line)
                    continue
                attrs = parse_gff_attributes(fields[8])
                for key in ("FPKM", "conf_lo", "conf_hi"):
                    attrs[key] = str(float(attrs[key]) * fpkm_map_mass_ratio)
                fields[8] = gff_attributes_to_str(attrs, "GTF")
                new_transcripts_file.write("%s\n" % '\t'.join(fields))
    # Both files are closed (and therefore flushed) before the rewritten
    # file replaces the original.
    shutil.move(new_transcripts_file.name, transcripts)

# Record this run's mass so it can be used as a global model later.
if total_map_mass > -1:
    with open("global_model.txt", 'w') as f:
        f.write("%f\n" % total_map_mass)