Mercurial > repos > brenninc > subread_featurecounts1_5_0_p1
changeset 0:58ad7b512590 draft default tip
Uploaded
author | brenninc |
---|---|
date | Thu, 12 May 2016 09:48:15 -0400 |
parents | |
children | |
files | name_changer.py subread_featurecounts.xml tool-data/gene_transfer.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml |
diffstat | 5 files changed, 303 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/name_changer.py Thu May 12 09:48:15 2016 -0400 @@ -0,0 +1,102 @@ +#!/usr/bin/env python + +import optparse +import os.path + + +def fix_header_line(start_header, header_line, new_names): + header_parts = header_line.split("\t") + if len(header_parts) <= len(start_header): + raise Exception("Only found {0} columns in second (header) line expected at least {1}.".format(len(header_parts), (len(start_header) + 1))) + data_headers = header_parts[:len(start_header)] + if data_headers != start_header: + raise Exception("Unexpected start to second (header) line Found: ") + new_header = "\t".join(start_header) + file_headers = header_parts[len(start_header):] + if len(file_headers) != len(new_names): + raise Exception("Found {0} file columns in header line, but {1} new_name paramters provided.".format(len(file_headers), len(new_names))) + for i in range(len(file_headers)): + new_header += "\t" + new_header += new_names[i] + new_header += "\n" + return new_header + + +def clean_names(prefix, old_names): + if len(old_names) > 1: + shared_start = old_names[0].strip() + shared_ends = old_names[0].strip() + for name in old_names: + clean = name.strip() + while len(shared_start) > 0 and (not clean.startswith(shared_start)): + shared_start = shared_start[:-1] + while len(shared_ends) > 0 and (not clean.endswith(shared_ends)): + shared_ends = shared_ends[1:] + start = len(shared_start) + end = 0 - len(shared_ends) + else: + start = 0 + end = 0 + new_names = [] + if end < 0: + for name in old_names: + new_names.append(prefix + name.strip()[start:end]) + else: + for name in old_names: + new_names.append(prefix + name.strip()[start:]) + return new_names + + +def main(): + #Parse Command Line + parser = optparse.OptionParser() + parser.add_option("--raw_count_file", action="store", type="string", default=None, help="path to file original with the counts") + parser.add_option("--fixed_count_file", action="store", type="string", 
default=None, help="new path for renamaned counts file") + parser.add_option("--raw_summary_file", action="store", type="string", default=None, help="path to file original with the summary") + parser.add_option("--fixed_summary_file", action="store", type="string", default=None, help="new path for renamaned summary file") + parser.add_option("--names_file", action="store", type="string", default=None, help="path to file which contains the names.") + parser.add_option("--new_name", action="append", type="string", default=None, + help="Names to be used. Must be the same length as in the raw_count_file") + parser.add_option("--names_prefix", action="store", type="string", default="", help="Prefix to add in from of every name.") + + (options, args) = parser.parse_args() + + if not os.path.exists(options.raw_count_file): + parser.error("Unable to find raw_count_file {0}.".format(options.raw_count_file)) + if options.names_file: + if options.new_name: + parser.error("names_file parameter clashes with new_names paramter(s)") + if not os.path.exists(options.names_file): + parser.error("Unable to find names_file {0}.".format(options.names_file)) + new_names = [] + with open(options.names_file, "r") as names_file: + for line in names_file: + new_names.append(line.strip()) + new_names = clean_names(options.names_prefix, new_names) + else: + if not options.new_name: + parser.error("No names_file or new_name paraters provided.") + new_names = options.new_name + + print "Changing column names to ", new_names + + with open(options.raw_count_file, "r") as input_file: + with open(options.fixed_count_file, "w") as output_file: + input_file.readline() # job line + start_header = ["Geneid", "Chr", "Start", "End", "Strand", "Length"] + header_line = fix_header_line(start_header, input_file.readline(), new_names) + output_file.write(header_line) + for line in input_file: + output_file.write(line) + + with open(options.raw_summary_file, "r") as input_file: + with 
open(options.fixed_summary_file, "w") as output_file: + start_header = ["Status"] + header_line = fix_header_line(start_header, input_file.readline(), new_names) + output_file.write(header_line) + for line in input_file: + output_file.write(line) + + +if __name__ == "__main__": + main()
<!-- Changeset file: b/subread_featurecounts.xml (new file, Thu May 12 09:48:15 2016 -0400) -->
<!--
    Galaxy tool wrapper: runs featureCounts on one or more bam files and then
    either renames the per-file columns with name_changer.py or copies the raw
    results to the outputs unchanged.
-->
<tool id="subread_featurecounts" name="FeatureCount from subread" version="1.5.0-p1">
    <description>Runs FeatureCount from subread</description>
    <requirements>
        <requirement type="package" version="1.5.0-p1">subread</requirement>
    </requirements>
    <stdio>
        <exit_code range="1" level="fatal" description="Error code 1 occurred" />
        <exit_code range="2:255" level="fatal" description="Unknown error occurred" />
    </stdio>
    <!--
        The command is wrapped in CDATA so that shell operators need no XML escaping.
        Every interpolated path or name is quoted so that values containing spaces
        stay a single argument.
        The steps are chained with a shell AND so that the post processing only
        runs when featureCounts succeeded and its exit code is not masked.
    -->
    <command><![CDATA[
        featureCounts -p -t exon -g gene_id
        #if $reference_source.reference_source_selector=='history':
            -a "${reference_source.ref_file}"
        #end if
        #if $reference_source.reference_source_selector=='cached':
            -a "${reference_source.ref_path.fields.path}"
        #end if
        -o counts
        #if $names_source.names_source_selector=='manual':
            #for $s in $names_source.input_serie
                "${s.input_file}"
            #end for
        #else
            #for $input in $names_source.inputs
                "${input}"
            #end for
        #end if
        &&
        #if $names_source.names_source_selector in ["file","manual"]:
            python "$__tool_directory__/name_changer.py"
            --raw_count_file counts --fixed_count_file "${output}"
            --raw_summary_file counts.summary --fixed_summary_file "${summary}"
            #if $names_source.names_source_selector=='file':
                --names_file "${names_source.names_file}"
                #if $names_source.names_prefix:
                    --names_prefix "${names_source.names_prefix}"
                #end if
            #else:
                #for $s in $names_source.input_serie
                    --new_name "${s.new_name}"
                #end for
            #end if
        #else
            cp counts "${output}" &&
            cp counts.summary "${summary}"
        #end if
    ]]></command>
    <inputs>
        <conditional name="reference_source">
            <param name="reference_source_selector" type="select" label="Choose the source for the gene_transfer (gtf) file.">
                <option value="cached">Locally cached</option>
                <option value="history">History</option>
            </param>
            <when value="cached">
                <param name="ref_path" type="select" label="Using reference gene transfer">
                    <options from_data_table="gene_transfer"/>
                    <validator type="no_options" message="A built-in reference gene transfer is not available."/>
                </param>
            </when>
            <when value="history">
                <param name="ref_file" type="data" format="gtf" label="Using reference file" />
            </when>
        </conditional>
        <conditional name="names_source">
            <param name="names_source_selector" type="select" label="How are the inputs organized?">
                <option value="file">Collection of /Multiple bam file plus File with list of names.</option>
                <option value="galaxy_path">Collection of /Multiple bam files, but no file with list of names.</option>
                <option value="manual">Manually enter each file and a name for that file.</option>
            </param>
            <when value="file">
                <param name="inputs" format="bam" multiple="True" label="Bam file(s) to count Features of" type="data" />
                <param name="names_file" format="txt"
                       label="File which has the names for the columns. Note start and end strings shared by every name will be removed"
                       type="data" />
                <param name="names_prefix" size="30" type="text" value="" label="Prefix to add before every column name"/>
            </when>
            <when value="manual">
                <repeat name="input_serie" title="Files and names to add">
                    <param name="input_file" format="bam" label="Bam file to count Features of" type="data" />
                    <param name="new_name" size="30" type="text" value="" label="Name for that column"/>
                </repeat>
            </when>
            <when value="galaxy_path">
                <param name="inputs" format="bam" multiple="True" label="Bam file(s) to count Features of" type="data" />
            </when>
        </conditional>
    </inputs>
    <outputs>
        <data format="tabular" name="output" label="Counted Features" />
        <data format="tabular" name="summary" label="Feature Count Summary" />
    </outputs>
    <tests>
        <!-- Test data too large to be included but can be found at
             https://github.com/Christian-B/galaxy_shedtools/tree/master/subread_featurecounts -->
        <!-- NOTE(review): the ref_file values in the second and third test are absolute
             paths on the author's machine; they should presumably name a file in the
             test data directory instead. Confirm against the test data before changing. -->
        <test>
            <param ftype="bam" name="inputs" value="C75_sorted.bam" />
            <param name="reference_source|reference_source_selector" value="cached"/>
            <param ftype="gtf" name="reference_source|ref_path" value="gencode.vM5" />
            <param name="names_source|names_source_selector" value="galaxy_path"/>
            <output name="output" file="C75_FeatureCounts.tsv" ftype="tabular" compare="sim_size" delta="10000"/>
            <output name="summary">
                <assert_contents>
                    <has_text text="Unassigned_Ambiguity" />
                </assert_contents>
            </output>
        </test>
        <test>
            <param ftype="bam" name="inputs" value="C01_sorted.bam,C02_sorted.bam,C75_sorted.bam" />
            <param name="reference_source|reference_source_selector" value="history"/>
            <param ftype="gtf" name="reference_source|ref_file" value="/home/christian/Dropbox/Gene_data/gencode.vM5.annotation.gtf" />
            <param name="names_source|names_source_selector" value="file"/>
            <param ftype="txt" name="names_source|names_file" value="names.dat" />
            <param name="names_source|names_prefix" value="prefix_"/>
            <output name="output">
                <assert_contents>
                    <has_text text="prefix__1" />
                    <has_text text="prefix__2" />
                    <has_text text="prefix_75" />
                </assert_contents>
            </output>
            <output name="summary">
                <assert_contents>
                    <has_text text="Unassigned_Ambiguity" />
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="reference_source|reference_source_selector" value="history"/>
            <param ftype="gtf" name="reference_source|ref_file" value="/home/christian/Dropbox/Gene_data/gencode.vM5.annotation.gtf" />
            <param name="names_source|names_source_selector" value="manual"/>
            <param ftype="bam" name="names_source|input_serie_0|input_file" value="C01_sorted.bam" />
            <param name="names_source|input_serie_0|new_name" value="ForC1"/>
            <param ftype="bam" name="names_source|input_serie_1|input_file" value="C02_sorted.bam" />
            <param name="names_source|input_serie_1|new_name" value="ForC2"/>
            <param ftype="bam" name="names_source|input_serie_2|input_file" value="C75_sorted.bam" />
            <param name="names_source|input_serie_2|new_name" value="ForC75"/>
            <output name="output">
                <assert_contents>
                    <has_text text="ForC1" />
                    <has_text text="ForC2" />
                    <has_text text="ForC75" />
                </assert_contents>
            </output>
            <output name="summary">
                <assert_contents>
                    <has_text text="Unassigned_Ambiguity" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help>
<![CDATA[
This tool runs subread's Featurecounts function.

Feature count labels the data columns with the input file names. Which will be the .../000/024.dat style names used by galaxy.

This tool therefore post processes the result changing these column names with either values found in a file or entered manually.
In each case every name can be prefixed with the same value
]]>
    </help>
    <citations>
        <citation type="bibtex">
            @misc{
                Subread,
                author = {Liao Y, Smyth GK and Shi W},
                title = {Subread (incl FeatureCount) on SourceForge},
                url = {http://subread.sourceforge.net/}
            }
        </citation>
        <citation type="doi">10.1093/bioinformatics/btt656</citation>
        <citation type="doi">10.1093/nar/gkt214</citation>
    </citations>

</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/gene_transfer.loc.sample Thu May 12 09:48:15 2016 -0400 @@ -0,0 +1,14 @@ +#This file lists the locations and dbkeys of all the gene transfer files + +#This file has the format (white space characters are TAB characters): +# +#<unique_build_id> <dbkey> <display_name> <file_path> +# +#So, gene_transfer.loc could look something like this: +# +#vm5 vm5 vM5 annotation /path/to/vM5.annotation.gtf +# +#Your gene_transfer.loc file should contain an entry for each individual +#gtf file. +# +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Thu May 12 09:48:15 2016 -0400 @@ -0,0 +1,7 @@ +<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc--> +<tables> + <table name="gene_transfer" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/gene_transfer.loc" /> + </table> + </tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Thu May 12 09:48:15 2016 -0400 @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="subread" version="1.5.0-p1"> + <repository changeset_revision="7f2795b29d61" name="package_subread_1_5_0_p1" owner="brenninc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>