diff mass.py @ 12:d080005cffe1 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/cufflinks/cufflinks commit a0b0845a9d1b3e7ecdeacd1e606133617e3918bd"
author iuc
date Tue, 16 Jun 2020 13:00:32 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mass.py	Tue Jun 16 13:00:32 2020 -0400
@@ -0,0 +1,108 @@
+import shutil
+import sys
+import tempfile
+
+
+def parse_gff_attributes(attr_str):
+    """
+    Parses a GFF/GTF attribute string and returns a dictionary of name-value
+    pairs. The general format for a GFF3 attributes string is
+
+        name1=value1;name2=value2
+
+    The general format for a GTF attribute string is
+
+        name1 "value1" ; name2 "value2"
+
+    The general format for a GFF attribute string is a single string that
+    denotes the interval's group; in this case, method returns a dictionary
+    with a single key-value pair, and key name is 'group'
+    """
+    attributes_list = attr_str.split(";")
+    attributes = {}
+    for name_value_pair in attributes_list:
+        # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3
+        # attribute; next, try double quotes for GTF.
+        pair = name_value_pair.strip().split("=")
+        if len(pair) == 1:
+            pair = name_value_pair.strip().split("\"")
+        if len(pair) == 1:
+            # Could not split for some reason -- raise exception?
+            continue
+        if pair == '':
+            continue
+        name = pair[0].strip()
+        if name == '':
+            continue
+        # Need to strip double quote from values
+        value = pair[1].strip(" \"")
+        attributes[name] = value
+
+    if len(attributes) == 0:
+        # Could not split attributes string, so entire string must be
+        # 'group' attribute. This is the case for strictly GFF files.
+        attributes['group'] = attr_str
+    return attributes
+
+
+def gff_attributes_to_str(attrs, gff_format):
+    """
+    Convert GFF attributes to string. Supported formats are GFF3, GTF.
+    """
+    if gff_format == 'GTF':
+        format_string = '%s "%s"'
+        # Convert group (GFF) and ID, parent (GFF3) attributes to transcript_id, gene_id
+        id_attr = None
+        if 'group' in attrs:
+            id_attr = 'group'
+        elif 'ID' in attrs:
+            id_attr = 'ID'
+        elif 'Parent' in attrs:
+            id_attr = 'Parent'
+        if id_attr:
+            attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr]
+    elif gff_format == 'GFF3':
+        format_string = '%s=%s'
+    attrs_strs = []
+    for name, value in attrs.items():
+        attrs_strs.append(format_string % (name, value))
+    return " ; ".join(attrs_strs)
+
+
+stderr = sys.argv[1]
+global_model_file_name = sys.argv[2]
+transcripts = sys.argv[3]
+
+# Read standard error to get total map/upper quartile mass.
+total_map_mass = -1
+with open(stderr, 'r') as tmp_stderr2:
+    for line in tmp_stderr2:
+        if line.lower().find("map mass") >= 0 or line.lower().find("upper quartile") >= 0:
+            total_map_mass = float(line.split(":")[1].strip())
+            break
+
+if global_model_file_name != "None":
+    # Global model is simply total map mass from original run.
+    with open(global_model_file_name, 'r') as global_model_file:
+        global_model_total_map_mass = float(global_model_file.readline())
+
+    # Ratio of global model's total map mass to original run's map mass is
+    # factor used to adjust FPKM.
+    fpkm_map_mass_ratio = total_map_mass / global_model_total_map_mass
+
+    # Update FPKM values in transcripts.gtf file.
+    with open(transcripts, 'r') as transcripts_file:
+        with tempfile.NamedTemporaryFile(dir=".", delete=False) as new_transcripts_file:
+            for line in transcripts_file:
+                fields = line.split('\t')
+                attrs = parse_gff_attributes(fields[8])
+                attrs["FPKM"] = str(float(attrs["FPKM"]) * fpkm_map_mass_ratio)
+                attrs["conf_lo"] = str(float(attrs["conf_lo"]) * fpkm_map_mass_ratio)
+                attrs["conf_hi"] = str(float(attrs["conf_hi"]) * fpkm_map_mass_ratio)
+                fields[8] = gff_attributes_to_str(attrs, "GTF")
+                new_transcripts_file.write("%s\n" % '\t'.join(fields))
+    shutil.move(new_transcripts_file.name, transcripts)
+
+if total_map_mass > -1:
+    with open("global_model.txt", 'w') as f:
+        f.write("%f\n" % total_map_mass)