Mercurial > repos > devteam > cufflinks
diff mass.py @ 12:d080005cffe1 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/cufflinks/cufflinks commit a0b0845a9d1b3e7ecdeacd1e606133617e3918bd"
author | iuc |
---|---|
date | Tue, 16 Jun 2020 13:00:32 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mass.py Tue Jun 16 13:00:32 2020 -0400 @@ -0,0 +1,108 @@ +import shutil +import sys +import tempfile + + +def parse_gff_attributes(attr_str): + """ + Parses a GFF/GTF attribute string and returns a dictionary of name-value + pairs. The general format for a GFF3 attributes string is + + name1=value1;name2=value2 + + The general format for a GTF attribute string is + + name1 "value1" ; name2 "value2" + + The general format for a GFF attribute string is a single string that + denotes the interval's group; in this case, method returns a dictionary + with a single key-value pair, and key name is 'group' + """ + attributes_list = attr_str.split(";") + attributes = {} + for name_value_pair in attributes_list: + # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3 + # attribute; next, try double quotes for GTF. + pair = name_value_pair.strip().split("=") + if len(pair) == 1: + pair = name_value_pair.strip().split("\"") + if len(pair) == 1: + # Could not split for some reason -- raise exception? + continue + if pair == '': + continue + name = pair[0].strip() + if name == '': + continue + # Need to strip double quote from values + value = pair[1].strip(" \"") + attributes[name] = value + + if len(attributes) == 0: + # Could not split attributes string, so entire string must be + # 'group' attribute. This is the case for strictly GFF files. + attributes['group'] = attr_str + return attributes + + +def gff_attributes_to_str(attrs, gff_format): + """ + Convert GFF attributes to string. Supported formats are GFF3, GTF. + """ + if gff_format == 'GTF': + format_string = '%s "%s"' + # Convert group (GFF) and ID, parent (GFF3) attributes to transcript_id, gene_id + id_attr = None + if 'group' in attrs: + id_attr = 'group' + elif 'ID' in attrs: + id_attr = 'ID' + elif 'Parent' in attrs: + id_attr = 'Parent' + if id_attr: + attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr] + elif gff_format == 'GFF3': + format_string = '%s=%s' + attrs_strs = [] + for name, value in attrs.items(): + attrs_strs.append(format_string % (name, value)) + return " ; ".join(attrs_strs) + + +stderr = sys.argv[1] +global_model_file_name = sys.argv[2] +transcripts = sys.argv[3] + +# Read standard error to get total map/upper quartile mass. +total_map_mass = -1 +with open(stderr, 'r') as tmp_stderr2: + for line in tmp_stderr2: + if line.lower().find("map mass") >= 0 or line.lower().find("upper quartile") >= 0: + total_map_mass = float(line.split(":")[1].strip()) + break + +if global_model_file_name != "None": + # Global model is simply total map mass from original run. + with open(global_model_file_name, 'r') as global_model_file: + global_model_total_map_mass = float(global_model_file.readline()) + + # Ratio of global model's total map mass to original run's map mass is + # factor used to adjust FPKM. + fpkm_map_mass_ratio = total_map_mass / global_model_total_map_mass + + # Update FPKM values in transcripts.gtf file. + with open(transcripts, 'r') as transcripts_file: + with tempfile.NamedTemporaryFile(dir=".", delete=False) as new_transcripts_file: + for line in transcripts_file: + fields = line.split('\t') + attrs = parse_gff_attributes(fields[8]) + attrs["FPKM"] = str(float(attrs["FPKM"]) * fpkm_map_mass_ratio) + attrs["conf_lo"] = str(float(attrs["conf_lo"]) * fpkm_map_mass_ratio) + attrs["conf_hi"] = str(float(attrs["conf_hi"]) * fpkm_map_mass_ratio) + fields[8] = gff_attributes_to_str(attrs, "GTF") + new_transcripts_file.write("%s\n" % '\t'.join(fields)) + shutil.move(new_transcripts_file.name, transcripts) + +if total_map_mass > -1: + with open("global_model.txt", 'w') as f: + f.write("%f\n" % total_map_mass)