Mercurial > repos > yhoogstrate > edger_with_design_matrix
diff bin/design_matrix_creator @ 1:a4a4c88783ea draft
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools/raw/master/edger_with_design_matrix commit 2700e500a4fb135a20ede7d52221a9d31f1aaa5e-dirty
author | yhoogstrate |
---|---|
date | Tue, 01 Sep 2015 04:59:05 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/design_matrix_creator Tue Sep 01 04:59:05 2015 -0400 @@ -0,0 +1,222 @@ +#!/usr/bin/env python + +import argparse, os, shutil, sys, tempfile, subprocess + +__version_info__ = ('1', '0', '0')#, 'beta') +__version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3])+"-"+__version_info__[3] +__author__ = 'Youri Hoogstrate' +__homepage__ = 'https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools' +__license__ = 'GNU General Public License v3 (GPLv3)' + + +class sampleContainer: + def __init__(self): + self.samples = [] + self.treatments = {} + self.treatment_index = [] + self.treatment_types = {} + + def do_decode(self,encoded_str): + return encoded_str.decode("base64").strip().replace("\t",'') + + def add_samples(self,argument): + print " - Adding samples" + for sample in argument: + self.add_sample(self.do_decode(sample)) + + def add_sample(self,sample): + if(sample in self.samples): + sys.stderr.write("Error:\n* Non-unique sample: "+sample+"\n") + sys.exit(1) + else: + self.samples.append(sample) + print " - Added: "+sample + + def add_blocking(self,argument): + print " - Adding paired samples" + pair = [] + for block in argument: + self.add_block(block) + + def add_block(self,blocks): + blocks = blocks.split(":") + as_treatment = blocks[0] + blocks = blocks[1:] + + used_samples = [] + indexed_samples = {} + + for i in range(len(blocks)): + block = blocks[i] + samples = self.get_samples_from_block(block) + indexed_samples[i+1] = [] + for sample in samples: + if(sample in used_samples): + sys.stderr.write("Error:\n* Blocking contains multiple times the same sample: "+sample+"\n") + sys.exit(0) + else: + indexed_samples[i+1] = block + used_samples.append(sample) + + for sample in self.samples: + if(sample not in used_samples): + i = i + 1 + indexed_samples[i+1] = str(sample).encode('base64').strip() + + for index in indexed_samples.keys(): + key = str(index).encode('base64').strip() + as_treatment += ":"+key+":"+indexed_samples[index] + + self.add_treatment(as_treatment) + + def get_samples_from_block(self,decoded_block): + return [ self.do_decode(x) for x in decoded_block.split(",")] + + def add_treatments(self,argument): + print " - Adding treatments" + for treatment in argument: + self.add_treatment(treatment) + + def add_treatment(self,treatment_argument): + print " - Parsing treatment" + + + treatment_argument = treatment_argument.split(":") + name = self.do_decode(treatment_argument[0]) + treatment_argument = treatment_argument[1:] + + + treatment = {"factor_index":{},"sample_index":{}} + only_integers = True + + i = 1 + for item in treatment_argument: + if(i % 2): + factor = self.do_decode(item) + + if(treatment['factor_index'].has_key(factor)): + sys.stderr.write("Error:\n* Factor has been added multiple times to treatment: "+factor+"\n") + sys.exit(0) + else: + print " - Adding factor: "+factor + treatment["factor_index"][factor] = [] + if(not factor.isdigit()): + only_integers = False + else: + for sample in item.split(","): + sample = self.do_decode(sample) + + if(not sample in self.samples): + sys.stderr.write("Error:\n* Unknown sample: "+sample+"\n") + sys.exit(0) + + treatment["factor_index"][factor].append(sample) + if(treatment["sample_index"].has_key(sample)): + sys.stderr.write("Error:\n* Factor has been added to treatment before: "+sample+"/"+factor+", factors must be mutually exclusive!\n") + sys.exit(0) + else: + treatment["sample_index"][sample] = factor + i += 1 + + treatment_factors = sorted(treatment["factor_index"].keys()) + + if(name == None): + treatment["name"] = "_vs_".join(treatment_factors) + else: + treatment["name"] = str(name) + + if(len(treatment["sample_index"]) != len(self.samples)): + sys.stderr.write("Error:\n* The number of samples for treatment '"+treatment["name"]+"' ("+str(len(treatment["sample_index"]))+") is different from the total number of samples ("+str(len(self.samples))+").\n") + + if(only_integers): + treatment_type = "integer" + else: + treatment_type = "string" + + if(self.treatments.has_key(treatment["name"])): + sys.stderr.write("Error:\n* Treatment was already added: '"+treatment["name"]+"\n") + else: + self.treatments[treatment["name"]] = treatment + self.treatment_index.append(treatment["name"]) + self.treatment_types[treatment["name"]] = treatment_type + print " - Treatment \""+treatment["name"]+"\" of type \""+treatment_type+"\" is valid" + + def export(self,output): + # Open file stream + if(args.output == "-"): + fh = sys.stdout + else: + fh = open(args.output,"w") + + # Write header: + fh.write("sample-name\t"+"\t".join(self.treatment_index)+"\n") + + # Write body: + for sample in self.samples: + fh.write(sample) + for treatment_id in self.treatment_index: + treatment = self.treatments[treatment_id] + fh.write("\t"+treatment["sample_index"][sample]) + fh.write("\n") + + fh.close() + +if __name__=="__main__": + parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.") + parser.add_argument("-o","--output", help="Output file, '-' for stdout.",required=True) + parser.add_argument("-c","--columns-file", nargs="?", help='Use columns of [this] file as UIDs (counting from 1)') + parser.add_argument("-s","--sample-names", nargs="*", help='Sample names (UIDs that correspond to the columns in the expression matrix)') + parser.add_argument("-t","--treatments", nargs="+", help='Treatment or conditions: "name::sample:condition& (sample-names and conditions have to be provided using Base64 encoding to avoid weird characters)',required=True) + parser.add_argument("-b","--blocking", nargs="+", help='Description of sample blocking: "blocking_condition*&sample-1-name&sample-2-name&sample-n-name"') + + args = parser.parse_args() + + columns = None + if(args.columns_file): + with open(args.columns_file, "r") as f: + listed_columns = [None] + f.readline().strip("\n").split("\t") + for i in range(1,len(listed_columns)): + listed_columns[i] = listed_columns[i].encode('base64').replace('\n','') + + s = sampleContainer() + + if(listed_columns): + columns = [] + for sample in args.sample_names: + columns.append(listed_columns[int(sample)]) + + + treatments = [] + for treatment in args.treatments: + treatment = treatment.split(":") + for i in range(1,len(treatment)): + if(i%2 == 0): + treatment_tmp = treatment[i].split(",") + for j in range(len(treatment_tmp)): + treatment_tmp[j] = listed_columns[int(treatment_tmp[j])] + treatment[i] = ",".join(treatment_tmp) + + treatments.append(":".join(treatment)) + + blockings = [] + if(args.blocking): + for blocking in args.blocking: + blocking = blocking.split(":") + for i in range(1,len(blocking)): + block = blocking[i].split(",") + for j in range(len(block)): + block[j] = listed_columns[int(block[j])] + blocking[i] = ",".join(block) + blockings.append(":".join(blocking)) + + s.add_samples(columns) + s.add_treatments(treatments) + s.add_blocking(blockings) + + else: + s.add_samples(args.sample_names) + s.add_treatments(args.treatments) + if(args.blocking): + s.add_blocking(args.blocking) + + s.export(args.output)