Mercurial > repos > iuc > umi_tools_group

--- a/macros.xml	Tue Aug 29 17:37:21 2017 -0400
+++ b/macros.xml	Wed Jan 10 19:09:28 2018 -0500
@@ -19,5 +19,5 @@
             <yield />
         </requirements>
     </xml>
-    <token name="@VERSION@">0.5.0</token>
+    <token name="@VERSION@">0.5.3</token>
 </macros>
Binary file test-data/dedup_out1.bam has changed
Binary file test-data/dedup_out2.bam has changed
Binary file test-data/dedup_out3.bam has changed
Binary file test-data/dedup_out4.bam has changed
Binary file test-data/dedup_out5.bam has changed
Binary file test-data/dedup_out6.bam has changed
Binary file test-data/group_out1.bam has changed
Binary file test-data/group_out4.bam has changed
--- a/test-data/out_paired.log	Tue Aug 29 17:37:21 2017 -0400
+++ b/test-data/out_paired.log	Wed Jan 10 19:09:28 2018 -0500
@@ -1,6 +1,6 @@
-# output generated by extract --bc-pattern=NNNXXX --stdin=input_read1.gz --read2-in=input_read2.gz --stdout out1.gz --read2-out=out2.gz --log=/tmp/tmpREDhtd/files/003/dataset_3773.dat
-# job started at Tue Aug 29 14:20:44 2017 on gaius -- 5913ff4c-b1d1-4f65-abec-66ca6684bdbd
-# pid: 21437, system: Linux 4.10.0-33-generic #37-Ubuntu SMP Fri Aug 11 10:55:28 UTC 2017 x86_64
+# output generated by extract --bc-pattern=NNNXXX --stdin=input_read1.gz --read2-in=input_read2.gz --stdout out1.gz --read2-out=out2.gz --log=/tmp/tmpAvWtMd/files/000/dataset_20.dat
+# job started at Wed Jan  3 17:06:39 2018 on tzk-ThinkPad-T450s -- 9eecaab6-9562-4b29-8a2e-ae6c1034a14c
+# pid: 22903, system: Linux 4.10.0-42-generic #46~16.04.1-Ubuntu SMP Mon Dec 4 15:57:59 UTC 2017 x86_64
 # blacklist                               : None
 # compresslevel                           : 6
 # error_correct_cell                      : False
@@ -21,16 +21,17 @@
 # read2_stdout                            : False
 # reads_subset                            : None
 # reconcile                               : False
+# retain_umi                              : None
 # short_help                              : None
 # stderr                                  : <_io.TextIOWrapper name='<stderr>' mode='w' encoding='UTF-8'>
 # stdin                                   : <_io.TextIOWrapper name='input_read1.gz' encoding='ascii'>
-# stdlog                                  : <_io.TextIOWrapper name='/tmp/tmpREDhtd/files/003/dataset_3773.dat' mode='a' encoding='UTF-8'>
+# stdlog                                  : <_io.TextIOWrapper name='/tmp/tmpAvWtMd/files/000/dataset_20.dat' mode='a' encoding='UTF-8'>
 # stdout                                  : <_io.TextIOWrapper name='out1.gz' encoding='ascii'>
 # timeit_file                             : None
 # timeit_header                           : None
 # timeit_name                             : all
 # whitelist                               : None
-2017-08-29 14:20:44,451 INFO Starting barcode extraction
-2017-08-29 14:20:44,461 INFO Input Reads: 100
-2017-08-29 14:20:44,461 INFO Reads output: 100
-# job finished in 0 seconds at Tue Aug 29 14:20:44 2017 --  0.73  0.05  0.00  0.00 -- 5913ff4c-b1d1-4f65-abec-66ca6684bdbd
+2018-01-03 17:06:39,365 INFO Starting barcode extraction
+2018-01-03 17:06:39,377 INFO Input Reads: 100
+2018-01-03 17:06:39,377 INFO Reads output: 100
+# job finished in 0 seconds at Wed Jan  3 17:06:39 2018 --  0.60  0.05  0.00  0.00 -- 9eecaab6-9562-4b29-8a2e-ae6c1034a14c
--- a/test-data/out_single.log	Tue Aug 29 17:37:21 2017 -0400
+++ b/test-data/out_single.log	Wed Jan 10 19:09:28 2018 -0500
@@ -1,6 +1,6 @@
-# output generated by extract --bc-pattern=XXXNNN --stdin=/tmp/tmpREDhtd/files/003/dataset_3766.dat --stdout /tmp/tmpREDhtd/files/003/dataset_3767.dat --3prime --quality-filter-threshold 10 --quality-encoding phred33 --log=/tmp/tmpREDhtd/files/003/dataset_3768.dat
-# job started at Tue Aug 29 14:20:26 2017 on gaius -- 1cfd46e7-5d74-4c25-8cdd-89cf8f41a393
-# pid: 21338, system: Linux 4.10.0-33-generic #37-Ubuntu SMP Fri Aug 11 10:55:28 UTC 2017 x86_64
+# output generated by extract --bc-pattern=XXXNNN --stdin=/tmp/tmpAvWtMd/files/000/dataset_13.dat --stdout /tmp/tmpAvWtMd/files/000/dataset_14.dat --3prime --quality-filter-threshold 10 --quality-encoding phred33 --log=/tmp/tmpAvWtMd/files/000/dataset_15.dat
+# job started at Wed Jan  3 17:06:20 2018 on tzk-ThinkPad-T450s -- 8897b3b4-c6e2-4ee3-b6c0-71044a63b7e1
+# pid: 22794, system: Linux 4.10.0-42-generic #46~16.04.1-Ubuntu SMP Mon Dec 4 15:57:59 UTC 2017 x86_64
 # blacklist                               : None
 # compresslevel                           : 6
 # error_correct_cell                      : False
@@ -21,17 +21,18 @@
 # read2_stdout                            : False
 # reads_subset                            : None
 # reconcile                               : False
+# retain_umi                              : None
 # short_help                              : None
 # stderr                                  : <_io.TextIOWrapper name='<stderr>' mode='w' encoding='UTF-8'>
-# stdin                                   : <_io.TextIOWrapper name='/tmp/tmpREDhtd/files/003/dataset_3766.dat' mode='r' encoding='UTF-8'>
-# stdlog                                  : <_io.TextIOWrapper name='/tmp/tmpREDhtd/files/003/dataset_3768.dat' mode='a' encoding='UTF-8'>
-# stdout                                  : <_io.TextIOWrapper name='/tmp/tmpREDhtd/files/003/dataset_3767.dat' mode='w' encoding='UTF-8'>
+# stdin                                   : <_io.TextIOWrapper name='/tmp/tmpAvWtMd/files/000/dataset_13.dat' mode='r' encoding='UTF-8'>
+# stdlog                                  : <_io.TextIOWrapper name='/tmp/tmpAvWtMd/files/000/dataset_15.dat' mode='a' encoding='UTF-8'>
+# stdout                                  : <_io.TextIOWrapper name='/tmp/tmpAvWtMd/files/000/dataset_14.dat' mode='w' encoding='UTF-8'>
 # timeit_file                             : None
 # timeit_header                           : None
 # timeit_name                             : all
 # whitelist                               : None
-2017-08-29 14:20:26,493 INFO Starting barcode extraction
-2017-08-29 14:20:26,495 INFO Input Reads: 100
-2017-08-29 14:20:26,495 INFO Reads output: 72
-2017-08-29 14:20:26,495 INFO filtered: umi quality: 28
-# job finished in 0 seconds at Tue Aug 29 14:20:26 2017 --  0.75  0.02  0.00  0.00 -- 1cfd46e7-5d74-4c25-8cdd-89cf8f41a393
+2018-01-03 17:06:20,360 INFO Starting barcode extraction
+2018-01-03 17:06:20,362 INFO Input Reads: 100
+2018-01-03 17:06:20,362 INFO Reads output: 72
+2018-01-03 17:06:20,362 INFO filtered: umi quality: 28
+# job finished in 0 seconds at Wed Jan  3 17:06:20 2018 --  0.58  0.06  0.00  0.00 -- 8897b3b4-c6e2-4ee3-b6c0-71044a63b7e1
--- a/umi-tools_group.xml	Tue Aug 29 17:37:21 2017 -0400
+++ b/umi-tools_group.xml	Wed Jan 10 19:09:28 2018 -0500
@@ -4,7 +4,7 @@
         <import>macros.xml</import>
     </macros>
     <expand macro="requirements">
-        <requirement type="package" version="1.5">samtools</requirement>
+        <requirement type="package" version="1.6">samtools</requirement>
     </expand>
     <command detect_errors="exit_code"><![CDATA[
         #if $input.is_of_type("sam"):
@@ -15,9 +15,11 @@
             #set $input_file = 'input.bam'
         #end if

-        umi_tools group --extract-umi-method $extract_umi_method
+        umi_tools group
+            --random-seed 0
+            --extract-umi-method $extract_umi_method
             #if str($extract_umi_method) != 'read_id':
-                --umi-separator '$umi_separator' --umi-tag $umi_tag
+                --umi-separator '$umi_separator' --umi-tag '$umi_tag'
             #end if
             --method $method --edit-distance-threshold $edit_distance_threshold
             $paired $spliced_is_unique --soft-clip-threshold $soft_clip_threshold
@@ -50,7 +52,7 @@
         <param argument="--method" type="select" label="Method used to identify PCR duplicates within reads." help="All methods start by identifying the reads with the same mapping position">
             <option value="unique">Reads in a group share the exact same UMI</option>
             <option value="cluster">Identify clusters based on hamming distance</option>
-            <option value="directional">Identify clusters based on distance and counts</option>
+            <option value="directional">Identify clusters based on distance and counts, restrict network expansion by threshold</option>
         </param>
         <param name="edit_distance_threshold" argument="--edit-distance-threshold" type="integer" value="1" label="Edit distance threshold" help="For the adjacency and cluster methods the threshold for the edit distance to connect two UMIs in the network can be increased. The default value of 1 works best unless the UMI is very long (&gt;14bp)" />
         <param argument="--paired" type="boolean" truevalue="--paired" falsevalue="" label="BAM is paired end" help="This will also force the use of the template length to determine reads with the same mapping coordinates." />
@@ -58,7 +60,7 @@
         <param name="soft_clip_threshold" argument="--soft-clip-threshold" type="integer" value="4" label="Soft clip threshold" help="Mappers that soft clip, will sometimes do so rather than mapping a spliced read if there is only a small overhang over the exon junction. By setting this option, you can treat reads with at least this many bases soft-clipped at the 3' end as spliced." />
         <param name="read_length" argument="--read-length" type="boolean" truevalue="--read-length" falsevalue="" label="Use the read length as a criterion when deduping" />
         <param name="whole_contig" argument="--whole-contig" type="boolean" truevalue="--whole-contig" falsevalue="" label="Consider all alignments to a single contig together" help="This is useful if you have aligned to a transcriptome multi-fasta" />
-        <param argument="--subset" type="float" min="0" max="1" value="1" label="Only consider a random selection of the reads" />
+        <param argument="--subset" type="float" min="0.0" max="1.0" value="1.0" label="Only consider a random selection of the reads" />
         <param argument="--chrom" type="boolean" truevalue="--chrom" falsevalue="" label="Only consider a single chromosome" />
         <param name="per_contig" argument="--per-contig" type="boolean" truevalue="--per-contig" falsevalue="" label="Deduplicate per contig" help="Field 3 in BAM; RNAME. All reads with the same contig will be considered to have the same alignment position. This is useful if your library prep generates PCR duplicates with non identical alignment positions such as CEL-Seq. In this case, you would align to a reference transcriptome with one transcript per gene" />
         <param name="per_gene" argument="--per-gene" type="boolean" truevalue="--per-gene" falsevalue="" label="Deduplicate per gene" help="As above except with this option you can align to a reference transcriptome with more than one transcript per gene. You need to also provide a map of genes to transcripts. This will also add a metacontig ('MC') tag to the output BAM file." />
@@ -73,17 +75,11 @@
     </outputs>
     <tests>
         <test>
-            <param name="input" value="group_in1.sam" ftype="sam" />
-            <param name="extract_umi_method" value="read_id" />
-            <param name="method" value="unique" />
-            <output name="output" file="group_out1.bam" />
-        </test>
-        <test>
             <param name="input" value="group_in2.bam" ftype="bam" />
             <param name="extract_umi_method" value="read_id" />
             <param name="paired" value="True" />
             <param name="method" value="unique" />
-            <output name="output" file="group_out2.bam" />
+            <output name="output" file="group_out2.bam" ftype="bam" sort="True" />
         </test>
         <test>
             <param name="input" value="group_in3.bam" ftype="bam" />
@@ -91,7 +87,7 @@
             <param name="group_output" value="True" />
             <param name="method" value="unique" />
             <output name="group_out" file="group_out3.tab" />
-            <output name="output" file="group_out3.bam" />
+            <output name="output" file="group_out3.bam" ftype="bam" sort="True" />
         </test>
         <test>
             <param name="input" value="group_in4.bam" ftype="bam" />
@@ -99,21 +95,21 @@
             <param name="umi_tag" value="BX" />
             <param name="method" value="unique" />
             <output name="group_out" file="group_out4.tab" />
-            <output name="output" file="group_out4.bam" />
+            <output name="output" file="group_out4.bam" ftype="bam" sort="True" />
         </test>
         <test>
             <param name="input" value="group_in5.bam" ftype="bam" />
             <param name="extract_umi_method" value="read_id" />
             <param name="umi_tag" value="BX" />
             <param name="method" value="cluster" />
-            <output name="output" file="group_out5.bam" />
+            <output name="output" file="group_out5.bam" ftype="bam" sort="True" />
         </test>
         <test>
             <param name="input" value="group_in6.bam" ftype="bam" />
             <param name="extract_umi_method" value="read_id" />
             <param name="umi_tag" value="BX" />
             <param name="method" value="directional" />
-            <output name="output" file="group_out6.bam" />
+            <output name="output" file="group_out6.bam" ftype="bam" sort="True" />
         </test>
     </tests>
     <help><![CDATA[