Galaxy |

Changeset 12:d080005cffe1 (2020-06-16)

Previous changeset 11:e04dbae2abe0 (2017-02-19)

Commit message:
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/cufflinks/cufflinks commit a0b0845a9d1b3e7ecdeacd1e606133617e3918bd"

modified:
cuff_macros.xml
cufflinks_wrapper.xml
test-data/cufflinks_out4.txt

added:
mass.py

removed:
cufflinks_wrapper.py

diff -r e04dbae2abe0 -r d080005cffe1 cuff_macros.xml
--- a/cuff_macros.xml Sun Feb 19 12:12:28 2017 -0500
+++ b/cuff_macros.xml Tue Jun 16 13:00:32 2020 -0400

[

@@ -66,20 +66,26 @@
   </token>
   <xml name="cufflinks_gtf_inputs">
     <param format="gtf" name="inputs" type="data" label="GTF file(s) produced by Cufflinks" help="" multiple="true" />
-    <repeat name="additional_inputs" title="Additional GTF Inputs (Lists)">
-      <param format="gtf" name="additional_inputs" type="data_collection" label="GTF file(s) produced by Cufflinks" help="" />
-    </repeat>
   </xml>
+  <token name="@CUFFLINKS_LINK_GTF_INPUTS@"><![CDATA[
+            ## Inputs.
+            #for $i, $input_file in enumerate($inputs):
+                ln -s '${input_file}' input_$i &&
+            #end for
+  ]]></token>
   <token name="@CUFFLINKS_GTF_INPUTS@">
             ## Inputs.
-            #for $input_file in $inputs:
-                '${input_file}'
-            #end for
-            #for $additional_input in $additional_inputs:
-                #for $input_file in $additional_input.additional_inputs:
-                    '${input_file}'
-                #end for
+            #for $i, $input_file in enumerate($inputs):
+                'input_$i'
             #end for
   </token>
   <token name="@HAS_MULTIPLE_INPUTS@">getattr(inputs, "__len__", [].__len__)() >= 2</token>
+
+  <xml name="citations">
+    <citations>
+        <citation type="doi">10.1038/nbt.1621</citation>
+        <yield/>
+    </citations>
+  </xml>
+
</macros>

diff -r e04dbae2abe0 -r d080005cffe1 cufflinks_wrapper.py
--- a/cufflinks_wrapper.py Sun Feb 19 12:12:28 2017 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

b'@@ -1,278 +0,0 @@\n-#!/usr/bin/env python\n-\n-import optparse\n-import os\n-import shutil\n-import subprocess\n-import sys\n-import tempfile\n-\n-\n-def parse_gff_attributes( attr_str ):\n- """\n- Parses a GFF/GTF attribute string and returns a dictionary of name-value\n- pairs. The general format for a GFF3 attributes string is\n-\n- name1=value1;name2=value2\n-\n- The general format for a GTF attribute string is\n-\n- name1 "value1" ; name2 "value2"\n-\n- The general format for a GFF attribute string is a single string that\n- denotes the interval\'s group; in this case, method returns a dictionary\n- with a single key-value pair, and key name is \'group\'\n- """\n- attributes_list = attr_str.split(";")\n- attributes = {}\n- for name_value_pair in attributes_list:\n- # Try splitting by \'=\' (GFF3) first because spaces are allowed in GFF3\n- # attribute; next, try double quotes for GTF.\n- pair = name_value_pair.strip().split("=")\n- if len( pair ) == 1:\n- pair = name_value_pair.strip().split("\\"")\n- if len( pair ) == 1:\n- # Could not split for some reason -- raise exception?\n- continue\n- if pair == \'\':\n- continue\n- name = pair[0].strip()\n- if name == \'\':\n- continue\n- # Need to strip double quote from values\n- value = pair[1].strip(" \\"")\n- attributes[ name ] = value\n-\n- if len( attributes ) == 0:\n- # Could not split attributes string, so entire string must be\n- # \'group\' attribute. This is the case for strictly GFF files.\n- attributes[\'group\'] = attr_str\n- return attributes\n-\n-\n-def gff_attributes_to_str( attrs, gff_format ):\n- """\n- Convert GFF attributes to string. Supported formats are GFF3, GTF.\n- """\n- if gff_format == \'GTF\':\n- format_string = \'%s "%s"\'\n- # Convert group (GFF) and ID, parent (GFF3) attributes to transcript_id, gene_id\n- id_attr = None\n- if \'group\' in attrs:\n- id_attr = \'group\'\n- elif \'ID\' in attrs:\n- id_attr = \'ID\'\n- elif \'Parent\' in attrs:\n- id_attr = \'Parent\'\n- if id_attr:\n- attrs[\'transcript_id\'] = attrs[\'gene_id\'] = attrs[id_attr]\n- elif gff_format == \'GFF3\':\n- format_string = \'%s=%s\'\n- attrs_strs = []\n- for name, value in attrs.items():\n- attrs_strs.append( format_string % ( name, value ) )\n- return " ; ".join( attrs_strs )\n-\n-\n-def stop_err(msg):\n- sys.exit("%s\\n" % msg)\n-\n-\n-def __main__():\n- parser = optparse.OptionParser()\n- parser.add_option(\'-1\', \'--input\', dest=\'input\', help=\' file of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.\')\n- parser.add_option(\'-I\', \'--max-intron-length\', dest=\'max_intron_len\', help=\'The minimum intron length. Cufflinks will not report transcripts with introns longer than this, and will ignore SAM alignments with REF_SKIP CIGAR operations longer than this. The default is 300,000.\')\n- parser.add_option(\'-F\', \'--min-isoform-fraction\', dest=\'min_isoform_fraction\', help=\'After calculating isoform abundance for a gene, Cufflinks filters out transcripts that it believes are very low abundance, because isoforms expressed at extremely low levels often cannot reliably be assembled, and may even be artifacts of incompletely spliced precursors of processed transcripts. This parameter is also used to filter out introns that have far fewer spliced alignments supporting them. The default is 0.05, or 5% of the most abundant isoform (the major isoform) of the gene.\')\n- parser.add_option(\'-j\', \'--pre-mrna-fraction\', dest=\'pre_mrna_fraction\', help=\'Some RNA-Seq protocols produce a significant amount of reads that originate from incompletely splic'..b'" --max-bundle-frags %i" % int(options.max_bundle_frags))\n- if options.min_intron_length:\n- cmd += (" --min-intron-length %i" % int(options.min_intron_length))\n- if options.trim_three_avgcov_thresh:\n- cmd += (" --trim-3-avgcov-thresh %i" % int(options.trim_three_avgcov_thresh))\n- if options.trim_three_dropoff_frac:\n- cmd += (" --trim-3-dropoff-frac %f" % float(options.trim_three_dropoff_frac))\n-\n- if options.do_bias_correction:\n- cmd += (" -b %s" % seq_path)\n- if options.no_effective_length_correction:\n- cmd += (" --no-effective-length-correction")\n- if options.no_length_correction:\n- cmd += (" --no-length-correction")\n-\n- # Add input files.\n- cmd += " " + options.input\n-\n- # Run command and handle output.\n- try:\n- # Run command.\n- with tempfile.NamedTemporaryFile(dir=".") as tmp_stderr:\n- returncode = subprocess.call(args=cmd, stderr=tmp_stderr, shell=True)\n-\n- # Error checking.\n- if returncode != 0:\n- # Get stderr, allowing for case where it\'s very large.\n- buffsize = 1048576\n- stderr = \'\'\n- with open(tmp_stderr.name) as tmp_stderr2:\n- try:\n- while True:\n- stderr += tmp_stderr2.read(buffsize)\n- if not stderr or len(stderr) % buffsize != 0:\n- break\n- except OverflowError:\n- pass\n- raise Exception(stderr)\n-\n- # Read standard error to get total map/upper quartile mass.\n- total_map_mass = -1\n- with open(tmp_stderr.name, \'r\') as tmp_stderr2:\n- for line in tmp_stderr2:\n- if line.lower().find("map mass") >= 0 or line.lower().find("upper quartile") >= 0:\n- total_map_mass = float(line.split(":")[1].strip())\n- break\n-\n- # If there\'s a global model provided, use model\'s total map mass\n- # to adjust FPKM + confidence intervals.\n- if options.global_model_file:\n- # Global model is simply total map mass from original run.\n- with open(options.global_model_file, \'r\') as global_model_file:\n- global_model_total_map_mass = float(global_model_file.readline())\n-\n- # Ratio of global model\'s total map mass to original run\'s map mass is\n- # factor used to adjust FPKM.\n- fpkm_map_mass_ratio = total_map_mass / global_model_total_map_mass\n-\n- # Update FPKM values in transcripts.gtf file.\n- with open("transcripts.gtf", \'r\') as transcripts_file:\n- with tempfile.NamedTemporaryFile(dir=".", delete=False) as new_transcripts_file:\n- for line in transcripts_file:\n- fields = line.split(\'\\t\')\n- attrs = parse_gff_attributes(fields[8])\n- attrs["FPKM"] = str(float(attrs["FPKM"]) * fpkm_map_mass_ratio)\n- attrs["conf_lo"] = str(float(attrs["conf_lo"]) * fpkm_map_mass_ratio)\n- attrs["conf_hi"] = str(float(attrs["conf_hi"]) * fpkm_map_mass_ratio)\n- fields[8] = gff_attributes_to_str(attrs, "GTF")\n- new_transcripts_file.write("%s\\n" % \'\\t\'.join(fields))\n- shutil.move(new_transcripts_file.name, "transcripts.gtf")\n-\n- # TODO: update expression files as well.\n-\n- # Set outputs. Transcript and gene expression handled by wrapper directives.\n- shutil.move("transcripts.gtf", options.assembled_isoforms_output_file)\n- if total_map_mass > -1:\n- with open("global_model.txt", \'w\') as f:\n- f.write("%f\\n" % total_map_mass)\n- except Exception as e:\n- stop_err(\'Error running cufflinks: %s\' % e)\n-\n-\n-if __name__ == "__main__":\n- __main__()\n'

diff -r e04dbae2abe0 -r d080005cffe1 cufflinks_wrapper.xml
--- a/cufflinks_wrapper.xml Sun Feb 19 12:12:28 2017 -0500
+++ b/cufflinks_wrapper.xml Tue Jun 16 13:00:32 2020 -0400

[

@@ -1,14 +1,15 @@
-<tool id="cufflinks" name="Cufflinks" version="@VERSION@.2">
+<tool id="cufflinks" name="Cufflinks" version="@VERSION@.3">
     <description>transcript assembly and FPKM (RPKM) estimates for RNA-Seq data</description>
     <macros>
       <import>cuff_macros.xml</import>
     </macros>
-    <expand macro="requirements" />
-    <version_command>cufflinks 2>&1 | head -n 1</version_command>
-    <command detect_errors="aggressive">
-        python '$__tool_directory__/cufflinks_wrapper.py'
-            --input '$input'
-            --assembled-isoforms-output '$assembled_isoforms'
+    <expand macro="requirements">
+       <requirement type="package" version="3.6">python</requirement>
+    </expand>
+    <version_command><![CDATA[cufflinks 2>&1 | head -n 1]]></version_command>
+    <command detect_errors="aggressive"><![CDATA[
+        cufflinks -q --no-update-check
+            '$input'
             --num-threads "\${GALAXY_SLOTS:-4}"
             -I $max_intron_len
             -F $min_isoform_fraction
@@ -31,30 +32,23 @@
             #if $bias_correction.do_bias_correction == "Yes":
                 -b
                 #if $bias_correction.seq_source.index_source == "history":
-                    --ref_file '$bias_correction.seq_source.ref_file'
+                    '$bias_correction.seq_source.ref_file'
                 #else:
-                    --index '${bias_correction.seq_source.index.fields.path}'
+                    '${bias_correction.seq_source.index.fields.path}'
                 #end if
             #end if

             ## Multi-read correct?
-            #if str($multiread_correct) == "Yes":
-                -u
-            #end if
-
-            ## Include global model if available.
-            #if $global_model:
-                --global_model '$global_model'
-            #end if
-
+            $multiread_correct
+
             ## advanced settings
             #if $advanced_settings.use_advanced_settings == "Yes":
                 --library-type $advanced_settings.library_type
                 #if $advanced_settings.mask_file:
                     --mask-file '$advanced_settings.mask_file'
                 #end if
-                --inner-mean-dist $advanced_settings.inner_mean_dist
-                --inner-dist-std-dev $advanced_settings.inner_dist_std_dev
+                --frag-len-mean $advanced_settings.inner_mean_dist
+                --frag-len-std-dev $advanced_settings.inner_dist_std_dev
                 --max-mle-iterations $advanced_settings.max_mle_iterations
                 --junc-alpha $advanced_settings.junc_alpha
                 --small-anchor-fraction $advanced_settings.small_anchor_fraction
@@ -65,7 +59,10 @@
                 --trim-3-avgcov-thresh $advanced_settings.trim_three_avgcov_thresh
                 --trim-3-dropoff-frac $advanced_settings.trim_three_dropoff_frac
             #end if
-    </command>
+            2> stderr
+
+        && python '$__tool_directory__/mass.py' stderr '$global_model' "transcripts.gtf"
+    ]]></command>
     <inputs>
         <param format="sam,bam" name="input" type="data" label="SAM or BAM file of aligned RNA-Seq reads" help=""/>
         <param name="max_intron_len" type="integer" value="300000" min="1" max="600000" label="Max Intron Length" help="ignore alignments with gaps longer than this"/>
@@ -125,11 +122,9 @@
             <when value="No" />
         </conditional>

-        <param name="multiread_correct" type="select" label="Use multi-read correct"
-            help="Tells Cufflinks to do an initial estimation procedure to more accurately weight reads mapping to multiple locations in the genome.">
-            <option value="No" selected="true">No</option>
-            <option value="Yes">Yes</option>
-        </param>
+        <param name="multiread_correct" type="boolean" label="Use multi-read correct"
+            truevalue="-u" falsevalue="" checked="false"
+            help="Tells Cufflinks to do an initial estimation procedure to more accurately weight reads mapping to multiple locations in the genome."/>

         <param name="length_correction" type="select" label="Apply length correction" help="Mode of length normalization to transcript FPKM.">
             <option value="" selected="true">Cufflinks Effective Length Correction</option>
@@ -175,7 +170,7 @@
     <outputs>
         <data format="tabular" name="genes_expression" label="${tool.name} on ${on_string}: gene expression" from_work_dir="genes.fpkm_tracking"/>
         <data format="tabular" name="transcripts_expression" label="${tool.name} on ${on_string}: transcript expression" from_work_dir="isoforms.fpkm_tracking"/>
-        <data format="gtf" name="assembled_isoforms" label="${tool.name} on ${on_string}: assembled transcripts"/>
+        <data format="gtf" name="assembled_isoforms" label="${tool.name} on ${on_string}: assembled transcripts" from_work_dir="transcripts.gtf"/>
         <data format="txt" name="total_map_mass" label="${tool.name} on ${on_string}: total map mass" hidden="true" from_work_dir="global_model.txt"/>
         <data format="gtf" name="skipped" label="${tool.name} on ${on_string}: Skipped Transcripts" from_work_dir="skipped.gtf"/>
     </outputs>
@@ -200,7 +195,7 @@
             <output name="genes_expression" ftype="tabular" lines_diff="2" file="cufflinks_out3.fpkm_tracking"/>
             <output name="transcripts_expression" ftype="tabular" lines_diff="2" file="cufflinks_out2.fpkm_tracking"/>
             <output name="assembled_isoforms" file="cufflinks_out1.gtf"/>
-            <output name="global_model" file="cufflinks_out4.txt"/>
+            <output name="total_map_mass" file="cufflinks_out4.txt"/>
             <output name="skipped" file="cufflinks_out4.gtf"/>
         </test>
     </tests>
@@ -298,7 +293,5 @@
   -G        Tells Cufflinks to use the supplied reference annotation to estimate isoform expression. It will not assemble novel transcripts, and the program will ignore alignments not structurally compatible with any reference transcript.
   -N        With this option, Cufflinks excludes the contribution of the top 25 percent most highly expressed genes from the number of mapped fragments used in the FPKM denominator. This can improve robustness of differential expression calls for less abundant genes and transcripts.
     </help>
-    <citations>
-        <citation type="doi">10.1038/nbt.1621</citation>
-    </citations>
+    <expand macro="citations"/>
</tool>

diff -r e04dbae2abe0 -r d080005cffe1 mass.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mass.py Tue Jun 16 13:00:32 2020 -0400

[

@@ -0,0 +1,108 @@
+import shutil
+import sys
+import tempfile
+
+
+def parse_gff_attributes(attr_str):
+    """
+    Parses a GFF/GTF attribute string and returns a dictionary of name-value
+    pairs. The general format for a GFF3 attributes string is
+
+        name1=value1;name2=value2
+
+    The general format for a GTF attribute string is
+
+        name1 "value1" ; name2 "value2"
+
+    The general format for a GFF attribute string is a single string that
+    denotes the interval's group; in this case, method returns a dictionary
+    with a single key-value pair, and key name is 'group'
+    """
+    attributes_list = attr_str.split(";")
+    attributes = {}
+    for name_value_pair in attributes_list:
+        # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3
+        # attribute; next, try double quotes for GTF.
+        pair = name_value_pair.strip().split("=")
+        if len(pair) == 1:
+            pair = name_value_pair.strip().split("\"")
+        if len(pair) == 1:
+            # Could not split for some reason -- raise exception?
+            continue
+        if pair == '':
+            continue
+        name = pair[0].strip()
+        if name == '':
+            continue
+        # Need to strip double quote from values
+        value = pair[1].strip(" \"")
+        attributes[name] = value
+
+    if len(attributes) == 0:
+        # Could not split attributes string, so entire string must be
+        # 'group' attribute. This is the case for strictly GFF files.
+        attributes['group'] = attr_str
+    return attributes
+
+
+def gff_attributes_to_str(attrs, gff_format):
+    """
+    Convert GFF attributes to string. Supported formats are GFF3, GTF.
+    """
+    if gff_format == 'GTF':
+        format_string = '%s "%s"'
+        # Convert group (GFF) and ID, parent (GFF3) attributes to transcript_id, gene_id
+        id_attr = None
+        if 'group' in attrs:
+            id_attr = 'group'
+        elif 'ID' in attrs:
+            id_attr = 'ID'
+        elif 'Parent' in attrs:
+            id_attr = 'Parent'
+        if id_attr:
+            attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr]
+    elif gff_format == 'GFF3':
+        format_string = '%s=%s'
+    attrs_strs = []
+    for name, value in attrs.items():
+        attrs_strs.append(format_string % (name, value))
+    return " ; ".join(attrs_strs)
+
+
+stderr = sys.argv[1]
+global_model_file_name = sys.argv[2]
+transcripts = sys.argv[3]
+
+# Read standard error to get total map/upper quartile mass.
+total_map_mass = -1
+with open(stderr, 'r') as tmp_stderr2:
+    for line in tmp_stderr2:
+        if line.lower().find("map mass") >= 0 or line.lower().find("upper quartile") >= 0:
+            total_map_mass = float(line.split(":")[1].strip())
+            break
+
+if global_model_file_name != "None":
+    # Global model is simply total map mass from original run.
+    with open(global_model_file_name, 'r') as global_model_file:
+        global_model_total_map_mass = float(global_model_file.readline())
+
+    # Ratio of global model's total map mass to original run's map mass is
+    # factor used to adjust FPKM.
+    fpkm_map_mass_ratio = total_map_mass / global_model_total_map_mass
+
+    # Update FPKM values in transcripts.gtf file.
+    with open(transcripts, 'r') as transcripts_file:
+        with tempfile.NamedTemporaryFile(dir=".", delete=False) as new_transcripts_file:
+            for line in transcripts_file:
+                fields = line.split('\t')
+                attrs = parse_gff_attributes(fields[8])
+                attrs["FPKM"] = str(float(attrs["FPKM"]) * fpkm_map_mass_ratio)
+                attrs["conf_lo"] = str(float(attrs["conf_lo"]) * fpkm_map_mass_ratio)
+                attrs["conf_hi"] = str(float(attrs["conf_hi"]) * fpkm_map_mass_ratio)
+                fields[8] = gff_attributes_to_str(attrs, "GTF")
+                new_transcripts_file.write("%s\n" % '\t'.join(fields))
+    shutil.move(new_transcripts_file.name, transcripts)
+
+if total_map_mass > -1:
+    with open("global_model.txt", 'w') as f:
+        f.write("%f\n" % total_map_mass)

diff -r e04dbae2abe0 -r d080005cffe1 test-data/cufflinks_out4.txt
--- a/test-data/cufflinks_out4.txt Sun Feb 19 12:12:28 2017 -0500
+++ b/test-data/cufflinks_out4.txt Tue Jun 16 13:00:32 2020 -0400

@@ -1,2 +1,1 @@
-tracking_id class_code nearest_ref_id gene_id gene_short_name tss_id locus length coverage FPKM FPKM_conf_lo FPKM_conf_hi FPKM_status
-CUFF.1.1 - - CUFF.1 - - test_chromosome:52-550 298 145.77 1.06791e+07 8.5427e+06 1.28156e+07 OK
+100.000000