Repository 'samtools_merge'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/samtools_merge

Changeset 0:740ce0a18f0d (2018-10-14)
Next changeset 1:8890e2d4c068 (2021-09-28)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/samtools/samtools_merge commit b3426aed6615742d96dfb8f7346a9e0d4e391a99
added:
macros.xml
samtools_merge.xml
test-data/2.merge.expected-samin.bam
test-data/2.merge.expected.bam
test-data/2.merge.expected.sam
test-data/4.merge.expected.bam
test-data/6.merge.expected.bam
test-data/7.merge.expected.bam
test-data/test_input_1_a.bam
test-data/test_input_1_a.sam
test-data/test_input_1_a_regex.sam
test-data/test_input_1_b.bam
test-data/test_input_1_b.sam
test-data/test_input_1_b_regex.sam
test-data/test_input_1_c.bam
test-data/test_input_1_c.sam
b
diff -r 000000000000 -r 740ce0a18f0d macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Sun Oct 14 13:44:49 2018 -0400
[
b'@@ -0,0 +1,172 @@\n+<macros>\n+    <xml name="requirements">\n+        <requirements>\n+            <requirement type="package" version="@TOOL_VERSION@">samtools</requirement>\n+            <yield/>\n+        </requirements>\n+    </xml>\n+    <token name="@TOOL_VERSION@">1.9</token>\n+    <token name="@FLAGS@">#set $flags = sum(map(int, str($filter).split(\',\')))</token>\n+    <token name="@PREPARE_IDX@"><![CDATA[\n+        ##prepare input and indices \n+        ln -s \'$input\' infile &&\n+        #if $input.is_of_type(\'bam\'):\n+            #if str( $input.metadata.bam_index ) != "None":\n+                ln -s \'${input.metadata.bam_index}\' infile.bai &&\n+            #else:\n+                samtools index infile infile.bai &&\n+            #end if\n+        #elif $input.is_of_type(\'cram\'):\n+            #if str( $input.metadata.cram_index ) != "None":\n+                ln -s \'${input.metadata.cram_index}\' infile.crai &&\n+            #else:\n+                samtools index infile infile.crai &&\n+            #end if\n+        #end if\n+    ]]></token>\n+    <token name="@PREPARE_IDX_MULTIPLE@"><![CDATA[\n+        ##prepare input and indices \n+        #for $i, $bam in enumerate( $input_bams ):\n+            ln -s \'$bam\' \'${i}\' &&\n+            #if $bam.is_of_type(\'bam\'):\n+                #if str( $bam.metadata.bam_index ) != "None":\n+                    ln -s \'${bam.metadata.bam_index}\' \'${i}.bai\' &&\n+                #else:\n+                    samtools index \'${i}\' \'${i}.bai\' &&\n+                #end if\n+            #elif $bam.is_of_type(\'cram\'):\n+                #if str( $bam.metadata.cram_index ) != "None":\n+                    ln -s \'${bam.metadata.cram_index}\' \'${i}.crai\' &&\n+                #else:\n+                    samtools index \'${i}\' \'${i}.crai\' &&\n+                #end if\n+            #end if\n+        #end for\n+    ]]></token>\n+    <token name="@PREPARE_FASTA_IDX@"><![CDATA[\n+        ##checks for 
reference data ($addref_cond.addref_select=="history" or =="cached")\n+        ##and sets the -t/-T parameters accordingly:\n+        ##- in case of history a symbolic link is used because samtools (view) will generate\n+        ##  the index which might not be possible in the directory containing the fasta file\n+        ##- in case of cached the absolute path is used which allows to read the cram file\n+        ##  without specifying the reference\n+        #if $addref_cond.addref_select == "history":\n+            ln -s \'${addref_cond.ref}\' reference.fa &&\n+            samtools faidx reference.fa &&\n+            #set reffa="reference.fa"\n+            #set reffai="reference.fa.fai"\n+        #elif $addref_cond.addref_select == "cached":\n+            #set reffa=str($addref_cond.ref.fields.path)\n+            #set reffai=str($addref_cond.ref.fields.path)+".fai"\n+        #else\n+            #set reffa=None\n+            #set reffai=None\n+        #end if\n+    ]]></token>\n+    <token name="@ADDTHREADS@"><![CDATA[\n+        ##compute the number of ADDITIONAL threads to be used by samtools (-@)\n+        addthreads=\\${GALAXY_SLOTS:-1} && (( addthreads-- )) &&\n+    ]]></token>\n+    <token name="@ADDMEMORY@"><![CDATA[\n+        ##compute the number of memory available to samtools sort (-m)\n+        ##use only 75% of available: https://github.com/samtools/samtools/issues/831\n+        addmemory=\\${GALAXY_MEMORY_MB_PER_SLOT:-768} && \n+        ((addmemory=addmemory*75/100)) &&\n+    ]]></token>\n+    <xml name="seed_input">\n+       <param name="seed" type="integer" optional="True" label="Seed for random number generator" help="If empty a random seed is used." 
/> \n+    </xml>\n+    <xml name="flag_options">\n+        <option value="1">read is paired</option>\n+        <option value="2">read is mapped in a proper pair</option>\n+        <option value="4">read is unmapped</option>\n+        <option value="8">mate is unmapped</option>\n+        <option value="16">read reverse strand</option>\n+        <option value="32">mate reverse strand</option>\n+        <option value="64">read is the'..b'on>\n+        <option value="1024">read is a PCR or optical duplicate</option>\n+        <option value="2048">supplementary alignment</option>\n+    </xml>\n+\n+    <!-- region specification macros and tokens for tools that allow the specification \n+         of region by bed file / space separated list of regions -->\n+    <token name="@REGIONS_FILE@"><![CDATA[\n+        #if $cond_region.select_region == \'tab\':\n+            -t \'$cond_region.targetregions\'\n+        #end if\n+    ]]></token>\n+    <token name="@REGIONS_MANUAL@"><![CDATA[\n+        #if $cond_region.select_region == \'text\':\n+            #for $i, $x in enumerate($cond_region.regions_repeat):\n+               \'${x.region}\'\n+            #end for\n+        #end if\n+    ]]></token>\n+    <xml name="regions_macro">\n+        <conditional name="cond_region">\n+            <param name="select_region" type="select" label="Filter by regions" help="restricts output to only those alignments which overlap the specified region(s)">\n+                <option value="no" selected="True">No</option>\n+                <option value="text">Manualy specify regions</option>\n+                <option value="tab">Regions from tabular file</option>\n+            </param>\n+            <when value="no"/>\n+            <when value="text">\n+                <repeat name="regions_repeat" min="1" default="1" title="Regions">\n+                    <param name="region" type="text" label="region" help="format chr:from-to">\n+                        <validator type="regex" message="Required 
format: CHR[:FROM[-TO]]; where CHR: string containing any character except quotes, whitespace and colon; FROM and TO: any integer">^[^\\s\'\\":]+(:\\d+(-\\d+){0,1}){0,1}$</validator>\n+                    </param>\n+                </repeat>\n+            </when>\n+            <when value="tab">\n+                <param name="targetregions" argument="-t/--target-regions" type="data" format="tabular" label="Target regions file" help="Do stats in these regions only. Tab-delimited file chr,from,to (1-based, inclusive)" />\n+            </when>\n+        </conditional>\n+    </xml>\n+\n+    <xml name="citations">\n+        <citations>\n+            <citation type="bibtex">\n+                @misc{SAM_def,\n+                title={Definition of SAM/BAM format},\n+                url = {https://samtools.github.io/hts-specs/},}\n+            </citation>\n+            <citation type="doi">10.1093/bioinformatics/btp352</citation>\n+            <citation type="doi">10.1093/bioinformatics/btr076</citation>\n+            <citation type="doi">10.1093/bioinformatics/btr509</citation>\n+            <citation type="bibtex">\n+                @misc{Danecek_et_al,\n+                Author={Danecek, P., Schiffels, S., Durbin, R.},\n+                title={Multiallelic calling model in bcftools (-m)},\n+                url = {http://samtools.github.io/bcftools/call-m.pdf},}\n+            </citation>\n+            <citation type="bibtex">\n+                @misc{Durbin_VCQC,\n+                Author={Durbin, R.},\n+                title={Segregation based metric for variant call QC},\n+                url = {http://samtools.github.io/bcftools/rd-SegBias.pdf},}\n+            </citation>\n+            <citation type="bibtex">\n+                @misc{Li_SamMath,\n+                Author={Li, H.},\n+                title={Mathematical Notes on SAMtools Algorithms},\n+                url = {http://www.broadinstitute.org/gatk/media/docs/Samtools.pdf},}\n+            </citation>\n+            
<citation type="bibtex">\n+                @misc{SamTools_github,\n+                title={SAMTools GitHub page},\n+                url = {https://github.com/samtools/samtools},}\n+            </citation>\n+        </citations>\n+    </xml>\n+    <xml name="version_command">\n+        <version_command><![CDATA[samtools 2>&1 | grep Version]]></version_command>\n+    </xml>\n+    <xml name="stdio">\n+        <stdio>\n+            <exit_code range="1:" level="fatal" description="Error" />\n+        </stdio>\n+    </xml>\n+</macros>\n'
b
diff -r 000000000000 -r 740ce0a18f0d samtools_merge.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/samtools_merge.xml Sun Oct 14 13:44:49 2018 -0400
[
@@ -0,0 +1,144 @@
+<tool id="samtools_merge" name="Samtools merge" version="@TOOL_VERSION@">
+    <description>merge multiple sorted alignment files</description>
+    <!-- Galaxy wrapper around `samtools merge`: each input dataset is staged in the
+         working directory as N.sam (SAM/CRAM inputs are sorted first, everything
+         else is symlinked) and all staged files are merged into one BAM output. -->
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <expand macro="stdio"/>
+    <expand macro="version_command"/>
+    <command><![CDATA[
+## shell variable addthreads: number of ADDITIONAL threads (GALAXY_SLOTS - 1)
+@ADDTHREADS@
+## shell variable addmemory: 75% of GALAXY_MEMORY_MB_PER_SLOT (default 768), in MB
+@ADDMEMORY@
+
+## Stage the inputs as 0.sam, 1.sam, ... in the working directory:
+##  - sam and cram inputs are sorted with samtools sort and written as SAM
+##  - every other input is symlinked under the same naming scheme
+## While staging, track the common sort order in sortby and abort the job
+## if the inputs are not sorted consistently.
+#set sortby=''
+#for $i, $bam in enumerate( $bamfiles ):
+    #if $bam.is_of_type('sam', 'cram', ):
+        ## -m is the memory limit PER THREAD, so pass the per-slot budget rather
+        ## than the memory of the whole job
+        samtools sort
+        -@ \$addthreads -m \${addmemory}M -T sorttemp
+        -O sam
+        -o ${i}.sam
+        '$bam' &&
+    #else:
+        ln -s '$bam' ${i}.sam &&
+    #end if
+
+    #if ($sortby=='coord' or $sortby=='') and $bam.is_of_type('sam','bam','cram'):
+        #set sortby='coord'
+    #else if ($sortby=='name' or $sortby=='') and $bam.is_of_type('qname_sorted.bam', 'qname_input_sorted.bam'):
+        #set $sortby='name'
+    #else:
+        >&2 echo "inconsistently sorted input" &&
+        exit 1 &&
+    #end if
+#end for
+
+samtools merge
+-@ \$addthreads
+-s $seed
+## TODO force overwrite seems necessary (but I do not understand why ...)
+## NOTE(review): presumably required because the output path already exists
+## when the job starts - confirm.
+-f
+## Galaxy provides only default compression
+## #if $compression == 'levelone'
+##     -1
+## #else if $compression == 'uncompressed'
+##     -u
+## #end if
+#if str($headerbam) != 'None'
+    -h '$headerbam'
+#end if
+#if $sortby=='name'
+    -n
+#end if
+## TODO since galaxy can't represent this as data type at the moment this option is unsupported
+## -t TAG    The input alignments have been sorted by the value of TAG, then by either position or name (if -n is given).
+#if str($region) != ''
+    -R '$region'
+#end if
+## Attach an RG tag to each alignment. The tag value is inferred from file names.
+## -r
+## TODO -r makes no sense with the link names, is there some data set metadata (tags,...) that could be used?
+$idrg
+$idpg
+'$output'
+#for $i, $bam in enumerate( $bamfiles ):
+    ${i}.sam
+#end for
+    ]]></command>
+    <inputs>
+        <param name="bamfiles" type="data" format="sam,bam,cram" multiple="true" optional="false" label="Alignments in SAM/BAM/CRAM format" help="Sets of aligned reads." />
+        <!-- the region is handed to samtools merge via -R (the -n flag selects name-sorted merging instead) -->
+        <param name="region" type="text" optional="true" argument="-R" label="Merge files in a region" help="Merge files in the specified region indicated by a string" />
+        <param name="headerbam" type="data" format="sam,bam" argument="-h" multiple="false" optional="true" label="File to take @headers from" help="Use the lines of FILE as `@' headers to be copied to out.bam, replacing any header lines that would otherwise be copied from in1.bam. (FILE is actually in SAM format, though any alignment records it may contain are ignored.)" />
+        <param name="idrg" type="boolean" argument="-c" truevalue="-c" falsevalue="" checked="false" label="Make @RG headers unique" help="When several input files contain @RG headers with the same ID, emit only one of them (namely, the header line from the first file we find that ID in) to the merged output file. Combining these similar headers is usually the right thing to do when the files being merged originated from the same file. Without -c, all @RG headers appear in the output file, with random suffices added to their IDs where necessary to differentiate them." />
+        <param name="idpg" type="boolean" argument="-p" truevalue="-p" falsevalue="" checked="false" label="Make @PG headers unique" help="Similarly, for each @PG ID in the set of files to merge, use the @PG line of the first file we find that ID in rather than adding a suffix to differentiate similar IDs." />
+        <param name="seed" type="integer" argument="-s" value="1" label="random seed" />
+        <!-- compression options are not accessible in galaxy
+        <param name="compression" type="select" label="Advanced compression options" help="Select advanced compression options">
+            <option value="default" selected="true">default</option>
+            <option value="levelone">Use zlib compression level 1</option>
+            <option value="uncompressed">Uncompressed output</option>
+        </param>-->
+    </inputs>
+    <outputs>
+        <data name="output" format="bam" />
+    </outputs>
+    <tests>
+        <!-- tests and data extracted from
+             https://github.com/samtools/samtools/blob/9ce8c64493f7ea3fa69bc5c1ac980b1a8e3dcf1f/test/test.pl
+             https://github.com/samtools/samtools/tree/develop/test/merge -->
+        <!-- # Merge 1 - Standard 3 file SAM merge all presented on the command line (only checks for similar size, because generated header info differs) -->
+        <test>
+            <param name="bamfiles" value="test_input_1_a.sam,test_input_1_b.sam,test_input_1_c.sam" />
+            <output name="output" file="2.merge.expected.bam" compare="sim_size" delta="50" />
+        </test>
+        <!-- Merge 2 - Standard 3 file BAM merge all files presented on the command line -->
+        <test>
+            <param name="bamfiles" value="test_input_1_a.bam,test_input_1_b.bam,test_input_1_c.bam" />
+            <output name="output" file="2.merge.expected.bam" compare="sim_size" delta="50" />
+        </test>
+        <!-- Merge 4 - 1 file BAM merge with file presented on the command line -->
+        <test>
+            <param name="bamfiles" value="test_input_1_b.bam" />
+            <output name="output" file="4.merge.expected.bam" compare="sim_size" delta="50" />
+        </test>
+        <!--Merge 5 - 3 file SAM merge all presented on the command line override IDs to file names (not implemented in tool) -->
+        <!--Merge 6 - merge all presented on the command line, combine PG and RG rather than dedup -->
+        <test>
+            <param name="bamfiles" value="test_input_1_a.bam,test_input_1_b.bam" />
+            <param name="idrg" value="-c" />
+            <param name="idpg" value="-p" />
+            <output name="output" file="6.merge.expected.bam" compare="sim_size" delta="50" />
+        </test>
+        <!-- Merge 7 - ID and SN with regex in them (probably not necessary for the galaxy tool because just different input) -->
+        <test>
+            <param name="bamfiles" value="test_input_1_a_regex.sam,test_input_1_b_regex.sam" />
+            <output name="output" file="7.merge.expected.bam" compare="sim_size" delta="50" />
+        </test>
+        <!-- Sort inputs by PG, then merge (not implemented, since -t not supported in the tool) -->
+        <!-- Sort inputs by PG, then merge (name sorted) (not implemented, since -t not supported in the tool) -->
+    </tests>
+    <help>
+**What it does**
+
+Merge multiple sorted alignment files, producing a single sorted output file that contains all the input records and maintains the existing sort order.
+
+If a file to take @headers from is specified the @SQ headers of input files will be merged into the specified header, otherwise they will be merged into a composite header created from the input headers. If in the process of merging @SQ lines for coordinate sorted input files, a conflict arises as to the order (for example input1.bam has @SQ for a,b,c and input2.bam has b,a,c) then the resulting output file will need to be re-sorted back into coordinate order.
+
+Unless the @PG/@RG headers are made unique when merging @RG and @PG records into the output header then any IDs found to be duplicates of existing IDs in the output header will have a suffix appended to them to differentiate them from similar header records from other files and the read records will be updated to reflect this.
+    </help>
+    <expand macro="citations"/>
+</tool>
b
diff -r 000000000000 -r 740ce0a18f0d test-data/2.merge.expected-samin.bam
b
Binary file test-data/2.merge.expected-samin.bam has changed
b
diff -r 000000000000 -r 740ce0a18f0d test-data/2.merge.expected.bam
b
Binary file test-data/2.merge.expected.bam has changed
b
diff -r 000000000000 -r 740ce0a18f0d test-data/2.merge.expected.sam
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/2.merge.expected.sam Sun Oct 14 13:44:49 2018 -0400
b
@@ -0,0 +1,67 @@
+@HD VN:1.4
+@SQ SN:insert LN:599
+@SQ SN:ref1 LN:45
+@SQ SN:ref2 LN:40
+@SQ SN:ref3 LN:4
+@RG ID:fish PG:donkey
+@RG ID:cow PU:13_&^&&*(:332
+@RG PU:*9u8jkjjkjd: ID:colt
+@RG ID:fish-55424A4 PG:llama
+@RG ID:cow-3A2CCEF5 PU:13_&^&&*(:332 PG:donkey-4861F4EF
+@RG PU:*9u8jkjjkjd: ID:colt-6ADB4A65
+@RG ID:fish-39E5EF
+@RG ID:cow-1802EEEC PU:13_&^&&*(:332
+@RG PU:*9u8jkjjkjd: ID:colt-7EC68B3F
+@PG ID:bull PP:donkey
+@PG ID:donkey
+@PG ID:moose
+@PG PP:moose ID:cow
+@PG ID:llama
+@PG ID:bull-2B019719 PP:donkey-4861F4EF
+@PG ID:donkey-4861F4EF
+@PG ID:bull-60104A41 PP:donkey-2EE20DF8
+@PG ID:donkey-2EE20DF8
+@CO
+@CO Do you know?
+@CO Do you know?
+@CO Another comment from test_input_1_c
+r000 99 insert 50 30 10M = 80 30 ATTTAGCTAC AAAAAAAAAA RG:Z:cow PG:Z:bull
+r000 211 insert 80 30 10M = 50 -30 CCCAATCATT AAAAAAAAAA RG:Z:cow PG:Z:bull
+r001 163 ref1 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 YY:i:100 RG:Z:fish
+r005 163 ref1 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 YY:i:100 RG:Z:colt-6ADB4A65 PG:Z:donkey-4861F4EF
+r008 163 ref1 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 YY:i:100 RG:Z:colt-7EC68B3F PG:Z:donkey-2EE20DF8
+r002 0 ref1 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * XA:Z:abc XB:i:-10
+r003 0 ref1 9 30 5H6M * 0 0 AGCTAA * RG:Z:cow
+r006 0 ref1 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * XA:Z:abc XB:i:-10 RG:Z:colt-6ADB4A65 PG:Z:donkey-4861F4EF
+r007 0 ref1 9 30 5H6M * 0 0 AGCTAA * RG:Z:colt-6ADB4A65 PG:Z:donkey-4861F4EF
+r009 0 ref1 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * XA:Z:abc XB:i:-10 RG:Z:colt-7EC68B3F PG:Z:donkey-2EE20DF8
+r010 0 ref1 9 30 5H6M * 0 0 AGCTAA * RG:Z:colt-7EC68B3F PG:Z:donkey-2EE20DF8
+r004 0 ref1 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * RG:Z:colt
+r007 0 ref1 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * RG:Z:colt-6ADB4A65 PG:Z:donkey-4861F4EF
+r010 0 ref1 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * RG:Z:colt-7EC68B3F PG:Z:donkey-2EE20DF8
+r003 16 ref1 29 30 6H5M * 0 0 TAGGC * RG:Z:cow
+r006 16 ref1 29 30 6H5M * 0 0 TAGGC * RG:Z:colt-6ADB4A65 PG:Z:donkey-4861F4EF
+r009 16 ref1 29 30 6H5M * 0 0 TAGGC * RG:Z:colt-7EC68B3F PG:Z:donkey-2EE20DF8
+r001 83 ref1 37 30 9M = 7 -39 CAGCGCCAT * RG:Z:fish
+r005 83 ref1 37 30 9M = 7 -39 CAGCGCCAT * RG:Z:colt-6ADB4A65 PG:Z:donkey-4861F4EF
+r008 83 ref1 37 30 9M = 7 -39 CAGCGCCAT * RG:Z:colt-7EC68B3F PG:Z:donkey-2EE20DF8
+x1 0 ref2 1 30 20M * 0 0 AGGTTTTATAAAACAAATAA * RG:Z:colt PG:Z:bull
+x7 0 ref2 1 30 20M * 0 0 AGGTTTTATAAAACAAATAA * RG:Z:cow-3A2CCEF5 PG:Z:bull-2B019719
+x10 0 ref2 1 30 20M * 0 0 AGGTTTTATAAAACAAATAA * RG:Z:cow-1802EEEC PG:Z:bull-60104A41
+x2 0 ref2 2 30 21M * 0 0 GGTTTTATAAAACAAATAATT ????????????????????? RG:Z:colt PG:Z:bull
+x8 0 ref2 2 30 21M * 0 0 GGTTTTATAAAACAAATAATT ????????????????????? RG:Z:cow-3A2CCEF5 PG:Z:bull-2B019719
+x11 0 ref2 2 30 21M * 0 0 GGTTTTATAAAACAAATAATT ????????????????????? RG:Z:cow-1802EEEC PG:Z:bull-60104A41
+x3 0 ref2 6 30 9M4I13M * 0 0 TTATAAAACAAATAATTAAGTCTACA ?????????????????????????? RG:Z:fish PG:Z:bull
+x9 0 ref2 6 30 9M4I13M * 0 0 TTATAAAACAAATAATTAAGTCTACA ?????????????????????????? RG:Z:cow-3A2CCEF5 PG:Z:bull-2B019719
+x12 0 ref2 6 30 9M4I13M * 0 0 TTATAAAACAAATAATTAAGTCTACA ?????????????????????????? RG:Z:cow-1802EEEC PG:Z:bull-60104A41
+x4 0 ref2 10 30 25M * 0 0 CAAATAATTAAGTCTACAGAGCAAC ????????????????????????? RG:Z:fish PG:Z:bull
+x10 0 ref2 10 30 25M * 0 0 CAAATAATTAAGTCTACAGAGCAAC ????????????????????????? RG:Z:cow-3A2CCEF5 PG:Z:bull-2B019719
+x13 0 ref2 10 30 25M * 0 0 CAAATAATTAAGTCTACAGAGCAAC ????????????????????????? RG:Z:cow-1802EEEC PG:Z:bull-60104A41
+x5 0 ref2 12 30 24M * 0 0 AATAATTAAGTCTACAGAGCAACT ???????????????????????? RG:Z:fish PG:Z:bull
+x11 0 ref2 12 30 24M * 0 0 AATAATTAAGTCTACAGAGCAACT ???????????????????????? RG:Z:cow-3A2CCEF5 PG:Z:bull-2B019719
+x14 0 ref2 12 30 24M * 0 0 AATAATTAAGTCTACAGAGCAACT ???????????????????????? RG:Z:cow-1802EEEC PG:Z:bull-60104A41
+x6 0 ref2 14 30 23M * 0 0 TAATTAAGTCTACAGAGCAACTA ??????????????????????? RG:Z:cow
+x12 0 ref2 14 30 23M * 0 0 TAATTAAGTCTACAGAGCAACTA ??????????????????????? RG:Z:cow-3A2CCEF5 PG:Z:bull-2B019719
+x15 0 ref2 14 30 23M * 0 0 TAATTAAGTCTACAGAGCAACTA ??????????????????????? RG:Z:cow-1802EEEC PG:Z:bull-60104A41
+u1 4 * 0 30 23M * 0 0 TAATTAAGTCTACAGAAAAAAAA ???????????????????????
+u2 4 * 0 30 * * 0 0 TAATTAAGTCTACAGAAAAAAAA ???????????????????????
b
diff -r 000000000000 -r 740ce0a18f0d test-data/4.merge.expected.bam
b
Binary file test-data/4.merge.expected.bam has changed
b
diff -r 000000000000 -r 740ce0a18f0d test-data/6.merge.expected.bam
b
Binary file test-data/6.merge.expected.bam has changed
b
diff -r 000000000000 -r 740ce0a18f0d test-data/7.merge.expected.bam
b
Binary file test-data/7.merge.expected.bam has changed
b
diff -r 000000000000 -r 740ce0a18f0d test-data/test_input_1_a.bam
b
Binary file test-data/test_input_1_a.bam has changed
b
diff -r 000000000000 -r 740ce0a18f0d test-data/test_input_1_a.sam
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_input_1_a.sam Sun Oct 14 13:44:49 2018 -0400
b
@@ -0,0 +1,28 @@
+@HD VN:1.4
+@SQ SN:insert LN:599
+@SQ SN:ref1 LN:45
+@SQ SN:ref2 LN:40
+@SQ SN:ref3 LN:4
+@RG ID:fish PG:donkey
+@RG ID:cow PU:13_&^&&*(:332
+@RG PU:*9u8jkjjkjd: ID:colt
+@PG ID:bull PP:donkey
+@PG ID:donkey
+@PG ID:moose
+@PG PP:moose ID:cow
+@CO
+r000 99 insert 50 30 10M = 80 30 ATTTAGCTAC AAAAAAAAAA RG:Z:cow PG:Z:bull
+r000 211 insert 80 30 10M = 50 -30 CCCAATCATT AAAAAAAAAA RG:Z:cow PG:Z:bull
+r001 163 ref1 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 YY:i:100 RG:Z:fish PG:Z:donkey
+r002 0 ref1 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * XA:Z:abc XB:i:-10 PG:Z:donkey
+r003 0 ref1 9 30 5H6M * 0 0 AGCTAA * RG:Z:cow
+r004 0 ref1 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * RG:Z:colt PG:Z:donkey
+r003 16 ref1 29 30 6H5M * 0 0 TAGGC * RG:Z:cow PG:Z:donkey
+r001 83 ref1 37 30 9M = 7 -39 CAGCGCCAT * RG:Z:fish PG:Z:donkey
+x1 0 ref2 1 30 20M * 0 0 AGGTTTTATAAAACAAATAA * RG:Z:colt PG:Z:bull
+x2 0 ref2 2 30 21M * 0 0 GGTTTTATAAAACAAATAATT ????????????????????? RG:Z:colt PG:Z:bull
+x3 0 ref2 6 30 9M4I13M * 0 0 TTATAAAACAAATAATTAAGTCTACA ?????????????????????????? RG:Z:fish PG:Z:bull
+x4 0 ref2 10 30 25M * 0 0 CAAATAATTAAGTCTACAGAGCAAC ????????????????????????? RG:Z:fish PG:Z:bull
+x5 0 ref2 12 30 24M * 0 0 AATAATTAAGTCTACAGAGCAACT ???????????????????????? RG:Z:fish PG:Z:bull
+x6 0 ref2 14 30 23M * 0 0 TAATTAAGTCTACAGAGCAACTA ??????????????????????? RG:Z:cow
+u1 4 * 0 30 23M * 0 0 TAATTAAGTCTACAGAAAAAAAA ???????????????????????
b
diff -r 000000000000 -r 740ce0a18f0d test-data/test_input_1_a_regex.sam
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_input_1_a_regex.sam Sun Oct 14 13:44:49 2018 -0400
b
@@ -0,0 +1,28 @@
+@HD VN:1.4
+@SQ SN:insert LN:599
+@SQ SN:ref1|this=that LN:45
+@SQ SN:ref2*HLA:1a:2:b LN:40
+@SQ SN:ref3 LN:4
+@RG ID:fish PG:donkey
+@RG ID:cow PU:13_&^&&*(:332
+@RG PU:*9u8jkjjkjd: ID:colt
+@PG ID:bull PP:donkey
+@PG ID:donkey
+@PG ID:moose
+@PG PP:moose ID:cow
+@CO
+r000 99 insert 50 30 10M = 80 30 ATTTAGCTAC AAAAAAAAAA RG:Z:cow PG:Z:bull
+r000 211 insert 80 30 10M = 50 -30 CCCAATCATT AAAAAAAAAA RG:Z:cow PG:Z:bull
+r001 163 ref1|this=that 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 YY:i:100 RG:Z:fish PG:Z:colt
+r002 0 ref1|this=that 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * XA:Z:abc XB:i:-10 PG:Z:colt
+r003 0 ref1|this=that 9 30 5H6M * 0 0 AGCTAA * RG:Z:cow
+r004 0 ref1|this=that 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * RG:Z:colt PG:Z:colt
+r003 16 ref1|this=that 29 30 6H5M * 0 0 TAGGC * RG:Z:cow PG:Z:colt
+r001 83 ref1|this=that 37 30 9M = 7 -39 CAGCGCCAT * RG:Z:fish PG:Z:colt
+x1 0 ref2*HLA:1a:2:b 1 30 20M * 0 0 AGGTTTTATAAAACAAATAA * RG:Z:colt PG:Z:bull
+x2 0 ref2*HLA:1a:2:b 2 30 21M * 0 0 GGTTTTATAAAACAAATAATT ????????????????????? RG:Z:colt PG:Z:bull
+x3 0 ref2*HLA:1a:2:b 6 30 9M4I13M * 0 0 TTATAAAACAAATAATTAAGTCTACA ?????????????????????????? RG:Z:fish PG:Z:bull
+x4 0 ref2*HLA:1a:2:b 10 30 25M * 0 0 CAAATAATTAAGTCTACAGAGCAAC ????????????????????????? RG:Z:fish PG:Z:bull
+x5 0 ref2*HLA:1a:2:b 12 30 24M * 0 0 AATAATTAAGTCTACAGAGCAACT ???????????????????????? RG:Z:fish PG:Z:bull
+x6 0 ref2*HLA:1a:2:b 14 30 23M * 0 0 TAATTAAGTCTACAGAGCAACTA ??????????????????????? RG:Z:cow
+u1 4 * 0 30 23M * 0 0 TAATTAAGTCTACAGAAAAAAAA ???????????????????????
b
diff -r 000000000000 -r 740ce0a18f0d test-data/test_input_1_b.bam
b
Binary file test-data/test_input_1_b.bam has changed
b
diff -r 000000000000 -r 740ce0a18f0d test-data/test_input_1_b.sam
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_input_1_b.sam Sun Oct 14 13:44:49 2018 -0400
b
@@ -0,0 +1,24 @@
+@HD VN:1.4
+@SQ SN:insert LN:599
+@SQ SN:ref1 LN:45
+@SQ SN:ref2 LN:40
+@SQ SN:ref3 LN:4
+@PG ID:llama
+@RG ID:fish PG:llama
+@RG ID:cow PU:13_&^&&*(:332 PG:donkey
+@RG PU:*9u8jkjjkjd: ID:colt
+@PG ID:bull PP:donkey
+@PG ID:donkey
+@CO Do you know?
+r005 163 ref1 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 YY:i:100 RG:Z:colt PG:Z:donkey
+r006 0 ref1 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * XA:Z:abc XB:i:-10 RG:Z:colt PG:Z:donkey
+r007 0 ref1 9 30 5H6M * 0 0 AGCTAA * RG:Z:colt PG:Z:donkey
+r007 0 ref1 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * RG:Z:colt PG:Z:donkey
+r006 16 ref1 29 30 6H5M * 0 0 TAGGC * RG:Z:colt PG:Z:donkey
+r005 83 ref1 37 30 9M = 7 -39 CAGCGCCAT * RG:Z:colt PG:Z:donkey
+x7 0 ref2 1 30 20M * 0 0 AGGTTTTATAAAACAAATAA * RG:Z:cow PG:Z:bull
+x8 0 ref2 2 30 21M * 0 0 GGTTTTATAAAACAAATAATT ????????????????????? RG:Z:cow PG:Z:bull
+x9 0 ref2 6 30 9M4I13M * 0 0 TTATAAAACAAATAATTAAGTCTACA ?????????????????????????? RG:Z:cow PG:Z:bull
+x10 0 ref2 10 30 25M * 0 0 CAAATAATTAAGTCTACAGAGCAAC ????????????????????????? RG:Z:cow PG:Z:bull
+x11 0 ref2 12 30 24M * 0 0 AATAATTAAGTCTACAGAGCAACT ???????????????????????? RG:Z:cow PG:Z:bull
+x12 0 ref2 14 30 23M * 0 0 TAATTAAGTCTACAGAGCAACTA ??????????????????????? RG:Z:cow PG:Z:bull
b
diff -r 000000000000 -r 740ce0a18f0d test-data/test_input_1_b_regex.sam
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_input_1_b_regex.sam Sun Oct 14 13:44:49 2018 -0400
[
@@ -0,0 +1,24 @@
+@HD VN:1.4
+@SQ SN:insert LN:599
+@SQ SN:ref2*HLA:1a:2:b LN:40
+@SQ SN:ref3 LN:4
+@SQ SN:ref1 LN:45
+@PG ID:llama_{a}
+@RG ID:fish-[1] PG:llama_{a}
+@RG ID:cow-[2] PU:13_&^&&*(:332 PG:donkey
+@RG PU:*9u8jkjjkjd: ID:colt
+@PG ID:bull PP:donkey
+@PG ID:donkey
+@CO Do you know?
+x7 0 ref2*HLA:1a:2:b 1 30 20M * 0 0 AGGTTTTATAAAACAAATAA * RG:Z:cow-[2] PG:Z:bull
+x8 0 ref2*HLA:1a:2:b 2 30 21M * 0 0 GGTTTTATAAAACAAATAATT ????????????????????? RG:Z:cow-[2] PG:Z:bull
+x9 0 ref2*HLA:1a:2:b 6 30 9M4I13M * 0 0 TTATAAAACAAATAATTAAGTCTACA ?????????????????????????? RG:Z:cow-[2] PG:Z:bull
+x10 0 ref2*HLA:1a:2:b 10 30 25M * 0 0 CAAATAATTAAGTCTACAGAGCAAC ????????????????????????? RG:Z:cow-[2] PG:Z:bull
+x11 0 ref2*HLA:1a:2:b 12 30 24M * 0 0 AATAATTAAGTCTACAGAGCAACT ???????????????????????? RG:Z:cow-[2] PG:Z:bull
+x12 0 ref2*HLA:1a:2:b 14 30 23M * 0 0 TAATTAAGTCTACAGAGCAACTA ??????????????????????? RG:Z:cow-[2] PG:Z:bull
+r005 163 ref1 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 YY:i:100 RG:Z:colt PG:Z:donkey
+r006 0 ref1 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * XA:Z:abc XB:i:-10 RG:Z:colt PG:Z:donkey
+r007 0 ref1 9 30 5H6M * 0 0 AGCTAA * RG:Z:colt PG:Z:donkey
+r007 0 ref1 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * RG:Z:colt PG:Z:donkey
+r006 16 ref1 29 30 6H5M * 0 0 TAGGC * RG:Z:colt PG:Z:donkey
+r005 83 ref1 37 30 9M = 7 -39 CAGCGCCAT * RG:Z:colt PG:Z:donkey
b
diff -r 000000000000 -r 740ce0a18f0d test-data/test_input_1_c.bam
b
Binary file test-data/test_input_1_c.bam has changed
b
diff -r 000000000000 -r 740ce0a18f0d test-data/test_input_1_c.sam
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_input_1_c.sam Sun Oct 14 13:44:49 2018 -0400
b
@@ -0,0 +1,23 @@
+@HD VN:1.4
+@SQ SN:ref1 LN:45
+@SQ SN:ref2 LN:40
+@RG ID:fish
+@RG ID:cow PU:13_&^&&*(:332
+@RG PU:*9u8jkjjkjd: ID:colt
+@PG ID:bull PP:donkey
+@PG ID:donkey
+@CO Do you know?
+@CO Another comment from test_input_1_c
+r008 163 ref1 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 YY:i:100 RG:Z:colt PG:Z:donkey
+r009 0 ref1 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * XA:Z:abc XB:i:-10 RG:Z:colt PG:Z:donkey
+r010 0 ref1 9 30 5H6M * 0 0 AGCTAA * RG:Z:colt PG:Z:donkey
+r010 0 ref1 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * RG:Z:colt PG:Z:donkey
+r009 16 ref1 29 30 6H5M * 0 0 TAGGC * RG:Z:colt PG:Z:donkey
+r008 83 ref1 37 30 9M = 7 -39 CAGCGCCAT * RG:Z:colt PG:Z:donkey
+x10 0 ref2 1 30 20M * 0 0 AGGTTTTATAAAACAAATAA * RG:Z:cow PG:Z:bull
+x11 0 ref2 2 30 21M * 0 0 GGTTTTATAAAACAAATAATT ????????????????????? RG:Z:cow PG:Z:bull
+x12 0 ref2 6 30 9M4I13M * 0 0 TTATAAAACAAATAATTAAGTCTACA ?????????????????????????? RG:Z:cow PG:Z:bull
+x13 0 ref2 10 30 25M * 0 0 CAAATAATTAAGTCTACAGAGCAAC ????????????????????????? RG:Z:cow PG:Z:bull
+x14 0 ref2 12 30 24M * 0 0 AATAATTAAGTCTACAGAGCAACT ???????????????????????? RG:Z:cow PG:Z:bull
+x15 0 ref2 14 30 23M * 0 0 TAATTAAGTCTACAGAGCAACTA ??????????????????????? RG:Z:cow PG:Z:bull
+u2 4 * 0 30 * * 0 0 TAATTAAGTCTACAGAAAAAAAA ???????????????????????