Mercurial > repos > iuc > samtools_merge

--- a/macros.xml	Sun Oct 14 13:44:49 2018 -0400
+++ b/macros.xml	Tue Sep 28 16:14:52 2021 +0000
@@ -5,10 +5,16 @@
             <yield/>
         </requirements>
     </xml>
-    <token name="@TOOL_VERSION@">1.9</token>
-    <token name="@FLAGS@">#set $flags = sum(map(int, str($filter).split(',')))</token>
+    <token name="@TOOL_VERSION@">1.13</token>
+    <token name="@PROFILE@">20.05</token>
+    <token name="@FLAGS@"><![CDATA[
+        #set $flags = 0
+        #if $filter
+            #set $flags = sum(map(int, str($filter).split(',')))
+        #end if
+    ]]></token>
     <token name="@PREPARE_IDX@"><![CDATA[
-        ##prepare input and indices
+        ##prepare input and indices
         ln -s '$input' infile &&
         #if $input.is_of_type('bam'):
             #if str( $input.metadata.bam_index ) != "None":
@@ -25,7 +31,7 @@
         #end if
     ]]></token>
     <token name="@PREPARE_IDX_MULTIPLE@"><![CDATA[
-        ##prepare input and indices
+        ##prepare input and indices
         #for $i, $bam in enumerate( $input_bams ):
             ln -s '$bam' '${i}' &&
             #if $bam.is_of_type('bam'):
@@ -63,6 +69,51 @@
             #set reffai=None
         #end if
     ]]></token>
+    <!-- Optional reference chooser: none, a FASTA from the history, or a built-in genome. Pass help="..." and argument="..." on expand; both default to empty. -->
+    <xml name="optional_reference" token_help="" token_argument="">
+        <conditional name="addref_cond">
+            <param name="addref_select" type="select" label="Use a reference sequence">
+                <help>@HELP@</help>
+                <option value="no">No</option>
+                <option value="history">Use a genome/index from the history</option>
+                <option value="cached">Use a built-in genome</option>
+            </param>
+            <when value="no"/>
+            <when value="history">
+                <param name="ref" argument="@ARGUMENT@" type="data" format="fasta,fasta.gz" label="Reference"/>
+            </when>
+            <when value="cached">
+                <param name="ref" argument="@ARGUMENT@" type="select" label="Reference">
+                    <options from_data_table="fasta_indexes">
+                        <filter type="data_meta" ref="input" key="dbkey" column="dbkey"/>
+                    </options>
+                    <validator type="no_options" message="No reference genome is available for the build associated with the selected input dataset"/>
+                </param>
+            </when>
+        </conditional>
+    </xml>
+    <!-- Mandatory reference chooser: a FASTA from the history or a built-in genome. Pass help="..." and argument="..." on expand; both default to empty. -->
+    <xml name="mandatory_reference" token_help="" token_argument="">
+        <conditional name="addref_cond">
+            <param name="addref_select" type="select" label="Use a reference sequence">
+                <help>@HELP@</help>
+                <option value="history">Use a genome/index from the history</option>
+                <option value="cached">Use a built-in genome</option>
+            </param>
+            <when value="history">
+                <param name="ref" argument="@ARGUMENT@" type="data" format="fasta,fasta.gz" label="Reference"/>
+            </when>
+            <when value="cached">
+                <param name="ref" argument="@ARGUMENT@" type="select" label="Reference">
+                    <options from_data_table="fasta_indexes">
+                        <filter type="data_meta" ref="input" key="dbkey" column="dbkey"/>
+                    </options>
+                    <validator type="no_options" message="No reference genome is available for the build associated with the selected input dataset"/>
+                </param>
+            </when>
+        </conditional>
+    </xml>
+
     <token name="@ADDTHREADS@"><![CDATA[
         ##compute the number of ADDITIONAL threads to be used by samtools (-@)
         addthreads=\${GALAXY_SLOTS:-1} && (( addthreads-- )) &&
@@ -70,28 +121,28 @@
     <token name="@ADDMEMORY@"><![CDATA[
         ##compute the number of memory available to samtools sort (-m)
         ##use only 75% of available: https://github.com/samtools/samtools/issues/831
-        addmemory=\${GALAXY_MEMORY_MB_PER_SLOT:-768} &&
+        addmemory=\${GALAXY_MEMORY_MB_PER_SLOT:-768} &&
         ((addmemory=addmemory*75/100)) &&
     ]]></token>
     <xml name="seed_input">
-       <param name="seed" type="integer" optional="True" label="Seed for random number generator" help="If empty a random seed is used." />
+       <param name="seed" type="integer" optional="True" label="Seed for random number generator" help="If empty a random seed is used." />
     </xml>
-    <xml name="flag_options">
-        <option value="1">read is paired</option>
-        <option value="2">read is mapped in a proper pair</option>
-        <option value="4">read is unmapped</option>
-        <option value="8">mate is unmapped</option>
-        <option value="16">read reverse strand</option>
-        <option value="32">mate reverse strand</option>
-        <option value="64">read is the first in a pair</option>
-        <option value="128">read is the second in a pair</option>
-        <option value="256">alignment or read is not primary</option>
-        <option value="512">read fails platform/vendor quality checks</option>
-        <option value="1024">read is a PCR or optical duplicate</option>
-        <option value="2048">supplementary alignment</option>
+    <xml name="flag_options" token_s1="false" token_s2="false" token_s4="false" token_s8="false" token_s16="false" token_s32="false" token_s64="false" token_s128="false" token_s256="false" token_s512="false" token_s1024="false" token_s2048="false">
+        <option value="1" selected="@S1@">Read is paired</option>
+        <option value="2" selected="@S2@">Read is mapped in a proper pair</option>
+        <option value="4" selected="@S4@">Read is unmapped</option>
+        <option value="8" selected="@S8@">Mate is unmapped</option>
+        <option value="16" selected="@S16@">Read is mapped to the reverse strand of the reference</option>
+        <option value="32" selected="@S32@">Mate is mapped to the reverse strand of the reference</option>
+        <option value="64" selected="@S64@">Read is the first in a pair</option>
+        <option value="128" selected="@S128@">Read is the second in a pair</option>
+        <option value="256" selected="@S256@">Alignment of the read is not primary</option>
+        <option value="512" selected="@S512@">Read fails platform/vendor quality checks</option>
+        <option value="1024" selected="@S1024@">Read is a PCR or optical duplicate</option>
+        <option value="2048" selected="@S2048@">Alignment is supplementary</option>
     </xml>

-    <!-- region specification macros and tokens for tools that allow the specification
+    <!-- region specification macros and tokens for tools that allow the specification
          of region by bed file / space separated list of regions -->
     <token name="@REGIONS_FILE@"><![CDATA[
         #if $cond_region.select_region == 'tab':
--- a/samtools_merge.xml	Sun Oct 14 13:44:49 2018 -0400
+++ b/samtools_merge.xml	Tue Sep 28 16:14:52 2021 +0000
@@ -1,4 +1,4 @@
-<tool id="samtools_merge" name="Samtools merge" version="@TOOL_VERSION@">
+<tool id="samtools_merge" name="Samtools merge" version="@TOOL_VERSION@" profile="@PROFILE@">
     <description>merge multiple sorted alignment files</description>
     <macros>
         <import>macros.xml</import>
@@ -8,36 +8,38 @@
     <expand macro="version_command"/>
     <command><![CDATA[
 @ADDTHREADS@
-
-## prepare input (sort sam, link other), determine input ordering
+## prepare input (sort sam, link other), determine input ordering
 ## and check for consistent input ordering
 #set sortby=''
 #for $i, $bam in enumerate( $bamfiles ):
     #if $bam.is_of_type('sam', 'cram', ):
         samtools sort
-        -@ \$addthreads -m \${GALAXY_MEMORY_MB:-768}M -T sorttemp
+        -@ \$addthreads -m \${GALAXY_MEMORY_MB:-768}M -T "\${TMPDIR:-.}"
         -O sam
         -o ${i}.sam
         '$bam' &&
     #else:
         ln -s '$bam' ${i}.sam &&
     #end if
-
     #if ($sortby=='coord' or $sortby=='') and $bam.is_of_type('sam','bam','cram'):
         #set sortby='coord'
     #else if ($sortby=='name' or $sortby=='') and $bam.is_of_type('qname_sorted.bam', 'qname_input_sorted.bam'):
         #set $sortby='name'
     #else:
-        >&2 echo "inconsistently sorted input" &&
+        >&2 echo "inconsistently sorted input" &&
         exit 1 &&
     #end if
 #end for
-
+#if $bed_file
+    #for $i, $bam in enumerate( $bamfiles ):
+        samtools index ${i}.sam &&
+    #end for
+#end if
 samtools merge
 -@ \$addthreads
 -s $seed
 ## TODO force overwrite seems necessay (but I do not understand why ...)
--f
+-f
 ## Galaxy provides only default compression
 ## #if $compression == 'levelone'
 ##     -1
@@ -51,13 +53,16 @@
     -n
 #end if
 ## TODO since galaxy can't represent this as data type at the moment this option is unsupported
-## -t TAG    The input alignments have been sorted by the value of TAG, then by either position or name (if -n is given).
+## -t TAG    The input alignments have been sorted by the value of TAG, then by either position or name (if -n is given).
 #if str($region) != ''
     -R '$region'
 #end if
 ## Attach an RG tag to each alignment. The tag value is inferred from file names.
 ## -r
 ## TODO -r makes no sense with the link names, is there some data set metadata (tags,...) that could be used?
+#if $bed_file:
+    -L '$bed_file'
+#end if
 $idrg
 $idpg
 $output
@@ -67,6 +72,7 @@
     ]]></command>
     <inputs>
         <param name="bamfiles" type="data" format="sam,bam,cram" multiple="true" optional="false" label="Alignments in BAM format" help="Sets of aligned reads." />
+        <param name="bed_file" type="data" optional="true" format="bed" label="Merge only reads overlapping the specified regions in the BED file" />
         <param name="region" type="text" optional="true" argument="-n" label="Merge files in a region" help="Merge files in the specified region indicated by a string" />
         <param name="headerbam" type="data" format="sam,bam" argument="-h" multiple="false" optional="true" label="File to take @headers from" help="Use the lines of FILE as `@' headers to be copied to out.bam, replacing any header lines that would otherwise be copied from in1.bam. (FILE is actually in SAM format, though any alignment records it may contain are ignored.)" />
         <param name="idrg" type="boolean" argument="-c" truevalue="-c" falsevalue="" checked="false" label="Make @RG headers unique" help="When several input files contain @RG headers with the same ID, emit only one of them (namely, the header line from the first file we find that ID in) to the merged output file. Combining these similar headers is usually the right thing to do when the files being merged originated from the same file. Without -c, all @RG headers appear in the output file, with random suffices added to their IDs where necessary to differentiate them." />
@@ -83,37 +89,44 @@
         <data name="output" format="bam" />
     </outputs>
     <tests>
-        <!-- tests and data extracted from
+        <!--tests and data extracted from
              https://github.com/samtools/samtools/blob/9ce8c64493f7ea3fa69bc5c1ac980b1a8e3dcf1f/test/test.pl
              https://github.com/samtools/samtools/tree/develop/test/merge -->
-        <!-- # Merge 1 - Standard 3 file SAM merge all presented on the command line (only checks for similar size, because generated header info differs) -->
+        <!-- Merge 1 - Standard 3 file SAM merge all presented on the command line (tolerates up to 16 differing lines, because generated header info differs) -->
         <test>
             <param name="bamfiles" value="test_input_1_a.sam,test_input_1_b.sam,test_input_1_c.sam" />
-            <output name="output" file="2.merge.expected.bam" compare="sim_size" delta="50" />
+            <output name="output" file="1.merge.expected.bam" ftype="bam" lines_diff="16" />
         </test>
         <!-- Merge 2 - Standard 3 file BAM merge all files presented on the command line -->
         <test>
             <param name="bamfiles" value="test_input_1_a.bam,test_input_1_b.bam,test_input_1_c.bam" />
-            <output name="output" file="2.merge.expected.bam" compare="sim_size" delta="50" />
+            <output name="output" file="2.merge.expected.bam" ftype="bam" lines_diff="16" />
         </test>
-        <!-- Merge 4 - 1 file BAM merge with file presented on the command line -->
+        <!-- Merge 3 - 1 file BAM merge with file presented on the command line -->
         <test>
             <param name="bamfiles" value="test_input_1_b.bam" />
-            <output name="output" file="4.merge.expected.bam" compare="sim_size" delta="50" />
+            <output name="output" file="3.merge.expected.bam" ftype="bam" lines_diff="16" />
         </test>
-        <!--Merge 5 - 3 file SAM merge all presented on the command line override IDs to file names (not implemented in tool) -->
-        <!--Merge 6 - merge all presented on the command line, combine PG and RG rather than dedup -->
+        <!--Merge 4 - merge all presented on the command line, combine PG and RG rather than dedup -->
         <test>
             <param name="bamfiles" value="test_input_1_a.bam,test_input_1_b.bam" />
             <param name="idrg" value="-c" />
             <param name="idpg" value="-p" />
-            <output name="output" file="6.merge.expected.bam" compare="sim_size" delta="50" />
+            <output name="output" file="4.merge.expected.bam" ftype="bam" lines_diff="16" />
         </test>
-        <!-- Merge 7 - ID and SN with regex in them (probably not necessary for the galaxy tool because just different input) -->
+        <!-- Merge 5 - ID and SN with regex in them (probably not necessary for the galaxy tool because just different input) -->
         <test>
             <param name="bamfiles" value="test_input_1_a_regex.sam,test_input_1_b_regex.sam" />
-            <output name="output" file="7.merge.expected.bam" compare="sim_size" delta="50" />
+            <output name="output" file="5.merge.expected.bam" ftype="bam" lines_diff="16" />
         </test>
+        <!-- Merge 6 - Merging with bedfile -->
+        <test>
+            <param name="bamfiles" value="test_input_1_a.bam,test_input_1_b.bam,test_input_1_c.bam" />
+            <!-- bed_file is the tool's only BED-related input; when set, it is passed to samtools merge as -L -->
+            <param name="bed_file" value="test_input_1_a.bed" />
+            <output name="output" file="6.merge.expected.bam" ftype="bam" lines_diff="16" />
+        </test>
+        <!--Merge - 3 file SAM merge all presented on the command line override IDs to file names (not implemented in tool) -->
         <!-- Sort inputs by PG, then merge (not implemented, since -t not supported in the tool) -->
         <!-- Sort inputs by PG, then merge (name sorted) (not implemented, since -t not supported in the tool) -->
     </tests>
@@ -123,7 +136,7 @@

 If a file to take @headers from is specified the @SQ headers of input files will be merged into the specified header, otherwise they will be merged into a composite header created from the input headers. If in the process of merging @SQ lines for coordinate sorted input files, a conflict arises as to the order (for example input1.bam has @SQ for a,b,c and input2.bam has b,a,c) then the resulting output file will need to be re-sorted back into coordinate order.

-Unless the @PG/@RG headers are made unique when merging @RG and @PG records into the output header then any IDs found to be duplicates of existing IDs in the output header will have a suffix appended to them to differentiate them from similar header records from other files and the read records will be updated to reflect this.
+Unless the @PG/@RG headers are made unique when merging @RG and @PG records into the output header then any IDs found to be duplicates of existing IDs in the output header will have a suffix appended to them to differentiate them from similar header records from other files and the read records will be updated to reflect this.
     </help>
     <expand macro="citations"/>
 </tool>
Binary file test-data/1.merge.expected.bam has changed
Binary file test-data/2.merge.expected.bam has changed
Binary file test-data/3.merge.expected.bam has changed
Binary file test-data/4.merge.expected.bam has changed
Binary file test-data/5.merge.expected.bam has changed
Binary file test-data/6.merge.expected.bam has changed
Binary file test-data/7.merge.expected.bam has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_input_1_a.bed	Tue Sep 28 16:14:52 2021 +0000
@@ -0,0 +1,14 @@
+insert	49	59	r000/1	30	+
+insert	79	89	r000/1/2	30	-
+ref1	6	22	r001/2	30	+
+ref1	8	18	r002	30	+
+ref1	8	14	r003	30	+
+ref1	15	40	r004	30	+
+ref1	28	33	r003	30	-
+ref1	36	45	r001/1	30	-
+ref2	0	20	x1	30	+
+ref2	1	22	x2	30	+
+ref2	5	27	x3	30	+
+ref2	9	34	x4	30	+
+ref2	11	35	x5	30	+
+ref2	13	36	x6	30	+