Repository 'umi_tools_whitelist'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/umi_tools_whitelist

Changeset 10:3adbf2fa0928 (2019-08-27)
Previous changeset 9:0c721837cbcf (2018-07-20) Next changeset 11:b9e787edcbbd (2019-09-10)
Commit message:
"planemo upload commit 28e58376e1d70e38276873a7d5e2ab44db88c2c0"
modified:
macros.xml
test-data/out_wl_single.txt
umi-tools_whitelist.xml
added:
test-data/out_wl_user.single.html
test-data/out_wl_user.single.tresh.tab
test-data/out_wl_user.single.txt
test-data/testYYY.40k.fastq.gz
b
diff -r 0c721837cbcf -r 3adbf2fa0928 macros.xml
--- a/macros.xml Fri Jul 20 03:49:25 2018 -0400
+++ b/macros.xml Tue Aug 27 17:11:52 2019 -0400
[
@@ -1,5 +1,23 @@
 <?xml version="1.0"?>
 <macros>
+    <macro name="barcode_sanitizer" >
+        <!-- Character whitelist expanded into the barcode pattern text params (bc_pattern, bc_pattern2).
+             Valid characters are letters, digits and the punctuation added below, which is the
+             punctuation used by regex style patterns such as (?P<cell_1>.{8,10}){s<=3}.
+             invalid_char is the empty string; NOTE(review): per the Galaxy sanitizer docs this should
+             mean disallowed characters are dropped rather than substituted, confirm. -->
+        <sanitizer invalid_char="">
+            <valid initial="string.letters,string.digits">
+                <add value="&#40;" /><!-- left parenthesis -->
+                <add value="&#41;" /><!-- right parenthesis -->
+                <add value="&#42;" /><!-- asterisk -->
+                <add value="&#44;" /><!-- comma -->
+                <add value="&#46;" /><!-- period -->
+                <add value="&#60;" /><!-- less than -->
+                <add value="&#61;" /><!-- equals sign -->
+                <add value="&#62;" /><!-- greater than -->
+                <add value="&#63;" /><!-- question mark -->
+                <add value="&#95;" /><!-- underscore -->
+                <add value="&#123;"/><!-- left brace -->
+                <add value="&#125;"/><!-- right brace -->
+            </valid>
+        </sanitizer>
+    </macro>
     <macro name="barcode2_conditional" >
         <conditional name="barcode">
             <param name="barcode_select" argument="--split-barcode" type="select" label="Barcode on both reads?">
@@ -11,6 +29,7 @@
                 <param name="bc_pattern2" argument="--bc-pattern2" type="text" value="" label="Barcode pattern for second read"
                        help="Use this option to specify the format of the UMI/barcode for
                              the second read pair if required." >
+                    <expand macro="barcode_sanitizer" />
                 </param>
             </when>
         </conditional>
@@ -55,7 +74,7 @@
             <yield />
         </requirements>
     </xml>
-    <token name="@VERSION@">0.5.3</token>
+    <token name="@VERSION@">0.5.5</token>
     <token name="@COMMAND_LINK@"><![CDATA[
         #set $gz = False
         #if $input_type.type == 'single':
b
diff -r 0c721837cbcf -r 3adbf2fa0928 test-data/out_wl_single.txt
--- a/test-data/out_wl_single.txt Fri Jul 20 03:49:25 2018 -0400
+++ b/test-data/out_wl_single.txt Tue Aug 27 17:11:52 2019 -0400
b
@@ -1,44 +1,4 @@
-# output generated by whitelist --bc-pattern=CCCCCCCCNNNNNNNN --subset-reads=0 --stdin=/tmp/tmpibtvD6/files/000/dataset_1.dat --method=reads --plot-prefix=OUT --3prime
-# job started at Sun Feb 25 10:49:56 2018 on bag -- cb0db520-8a4e-4040-aa88-93efc0718fa8
-# pid: 2217, system: Linux 4.13.0-32-generic #35-Ubuntu SMP Thu Jan 25 09:13:46 UTC 2018 x86_64
-# blacklist_tsv                           : None
-# cell_number                             : False
-# compresslevel                           : 6
-# error_correct_threshold                 : 1
-# expect_cells                            : False
-# extract_method                          : string
-# filter_cell_barcodes                    : False
-# log2stderr                              : False
-# loglevel                                : 1
-# method                                  : reads
-# pattern                                 : CCCCCCCCNNNNNNNN
-# pattern2                                : None
-# plot_prefix                             : OUT
-# prime3                                  : True
-# random_seed                             : None
-# read2_in                                : None
-# short_help                              : None
-# stderr                                  : <_io.TextIOWrapper name='<stderr>' mode='w' encoding='UTF-8'>
-# stdin                                   : <_io.TextIOWrapper name='/tmp/tmpibtvD6/files/000/dataset_1.dat' mode='r' encoding='UTF-8'>
-# stdlog                                  : <_io.TextIOWrapper name='<stdout>' mode='w' encoding='UTF-8'>
-# stdout                                  : <_io.TextIOWrapper name='<stdout>' mode='w' encoding='UTF-8'>
-# subset_reads                            : 0
-# timeit_file                             : None
-# timeit_header                           : None
-# timeit_name                             : all
-# whitelist_tsv                           : None
-## 2018-02-25 10:49:56,061 INFO Starting barcode extraction
-## 2018-02-25 10:49:56,061 INFO Parsed 0 reads
-## 2018-02-25 10:49:56,062 INFO Starting - whitelist determination
-## 2018-02-25 10:49:57,383 INFO Finished - whitelist determination
-## 2018-02-25 10:49:57,383 INFO Starting - finding putative error cell barcodes
-## 2018-02-25 10:49:57,383 INFO Finished - finding putative error cell barcodes
-## 2018-02-25 10:49:57,383 INFO Writing out whitelist
 AAAAAAAA AAAAAACA,AAACAAAA,AATAAAAA 3 1,1,1
 ACAAAAAC 2
 ACAACAAA 2
 TTACTTAA TTACTAAA 2 1
-## 2018-02-25 10:49:57,383 INFO Parsed 100 reads
-## 2018-02-25 10:49:57,383 INFO 100 reads matched the barcode pattern
-## 2018-02-25 10:49:57,383 INFO Found 95 unique cell barcodes
-# job finished in 1 seconds at Sun Feb 25 10:49:57 2018 --  2.25  0.06  0.00  0.00 -- cb0db520-8a4e-4040-aa88-93efc0718fa8
b
diff -r 0c721837cbcf -r 3adbf2fa0928 test-data/out_wl_user.single.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out_wl_user.single.html Tue Aug 27 17:11:52 2019 -0400
b
@@ -0,0 +1,1 @@
+<html> <head></head><body> <h1>Cell and Count Metrics</h1> <img src="OUT_cell_barcode_count_density.png" ><br /> <img src="OUT_cell_barcode_knee.png" ><br /> <img src="OUT_cell_barcode_counts.png" ><br /> </body></html>
b
diff -r 0c721837cbcf -r 3adbf2fa0928 test-data/out_wl_user.single.tresh.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out_wl_user.single.tresh.tab Tue Aug 27 17:11:52 2019 -0400
b
@@ -0,0 +1,7 @@
+count action
+2 Rejected
+8 Rejected
+11 Rejected
+27 Rejected
+90 Selected
+404 Rejected
b
diff -r 0c721837cbcf -r 3adbf2fa0928 test-data/out_wl_user.single.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out_wl_user.single.txt Tue Aug 27 17:11:52 2019 -0400
b
@@ -0,0 +1,90 @@
+AACCGCCTCAGGCAGCTATCTCGGTTA 6
+AACCGCCTCCTCTCTCAAACTACATAT TACCGCCTCCTCTCTCAAACTACATAT 4 1
+AAGACATGCAGTGAAAGGACAAACTTT 3
+ACACGCCGGTTCCATTGAGCAACATTA 3
+ACCAAGGACACTGTTAGAAAGTTTACG ACCAAGGACACTGTAAGAAAGTTTACG 3 1
+ACCCTGACCACAGTGGTACTACATGCG 4
+ACGAAAGGTCGTTCGGTATACTTCGGA TCGAAAGGTCGTTCGGTATACTTCGGA 3 1
+AGAGCTATGAAGAGTAGAAAGCCTTCT 3
+AGCCATCACAGCGACACCTCTCACGGA TGCCATCACAGCGACACCTCTCACGGA 6 1
+ATCAGAGCTAACCCTCGGAAGGGTCAG TTCAGAGCTAACCCTCGGAAGGGTCAG 3 1
+ATCCCGGAGGTACATCTACATAGGTCA 3
+ATGTAATGGTAGTCTTGAACGCTGTTG 3
+CAAACCGCCACGACCACCAAGTTTACG 3
+CAAACCGCCTGACAGACAAATGTATCG CAAACCGTCTGACAGACAAATGTATCG,GAAACCGCCTGACAGACAAATGTATCG 3 1,1
+CAAAGGCACAACCAAAGTACTAGACCG 3
+CAAAGGCACAACCCTCGGACTCATACG CAAAGGCACGACCCTCGGACTCATACG,GAAAGGCACAACCCTCGGACTCATACG 3 1,1
+CAAAGGCACAATCAGTTTATGGGACTC 3
+CAAAGGCACGATGAACTGGCTCCTTGA GAAAGGCACGATGAACTGGCTCCTTGA 4 1
+CAAAGGCACTAGGGATACCATAGGTCA GAAAGGCACTAGGGATACCATAGGTCA 4 1
+CAAGCAAGCGATGAACTGCAATACAAG 3
+CACACACTACACCCAAAGGTAGAAGCA GACACACTACACCCAAAGGTAGAAGCA 4 1
+CACATTGCAAAACGTACCAAGGCCGCA 3
+CAGCAAGATTTGGAGGTAAGTCTGTAC 3
+CAGCTGACATACAGGATATCCGTCTTA 3
+CCACTTGGAAAACTGCGCAAATGGAGG 3
+CTATGAAATCTCTCTCAAAAATTACAG GTATGAAATCTCTCTCAAAAATTACAG 4 1
+CTTCACATAGTACATCTAGATCAGCGA 3
+GAATCTGTAAAGGACTATAAAGTCATT 4
+GACGGATTAAGTGTTGTCACTCATACG 3
+GACGGATTATATAGCCCTCAATAGGGT 3
+GAGAATCGTCAAGACCTACAGCCTGGC CAGAATCGTCAAGACCTACAGCCTGGC,GGGAATCGTCAAGACCTACAGCCTGGC 3 1,1
+GAGGTGCTAGCCAATGTAGATAGAGGA 3
+GAGTACATTACCAAAATGTGAAGCCAA 3
+GAGTACATTTTGGAGGTAACAACTAGT 3
+GCAATCCGAAGATAGTTCCAGCCTGGC GCAATCCGAAGATAGTACCAGCCTGGC 4 1
+GCAATCCGAAGGCAGCTAAATTCGGCG GCAATCCGAAGGCAGCTGAATTCGGCG 3 1
+GCAATCCGAGAGTGCGAATCCTCAATA GCAATCCGAGAGAGCGAATCCTCAATA,GCAATCCGAGAGTGCGTATCCTCAATA 5 1,1
+GCAATCCGAGCCAATGTAGATGGTCCA 3
+GCACTGTCAGATGAACTGCAAGTAGAA 3
+GCACTGTCAGTGTGTCGACGTCTAGGT 3
+GCCTTACAAGTGCAGTAACTCAAGACA 3
+GCGATTACAAAGCTACTTACTTACGAT GCGATTACAAAGCTACTTCCTTACGAT 4 1
+GCGATTACAATCAACCGAGGCACAACA 3
+GCGATTACATGTTCTCCACAATAGGGT 3
+GCTGCCAATAGGAGGCGCTGAAGCCAA 3
+GCTGCCAATATTCATCGTAGTTGTTCT GCTGCCAATATTCATCGTAGTTGTTCC 7 1
+GCTGCCAATCAACAACGGTGAAGCCAA GCTGCCAATCATCAACGGTGAAGCCAA 6 1
+GCTGCCAATCACCTACCCAAGCCTTCT GATGCCAATCACCTACCCAAGCCTTCT 3 1
+GGATTAGGAAGCTGCCGTGGCACAACA GGAGTAGGAAGCTGCCGTGGCACAACA 3 1
+GGCAAGCAAACCAAAATGACTTACGAT 4
+GGCAAGCAAGATGAACTGCTAAGCTTC 4
+GGCAAGCAATAGTGACTACTACATGCG 3
+GTACAGAACAATCCTGAACTATTAGCC CTACAGAACAATCCTGAACTATTAGCC 3 1
+GTATGAAATATCCAACCGCTGTACGGA 3
+TAAGCGTTACACAAAGGCCACAAGTAT 5
+TAAGCGTTATATAGCCCTCAACCTCCA 4
+TACCGAGCAATCCTAGGATCTCACGGA 3
+TACCGAGCACAAGACCTACGTACTACG 5
+TACCGAGCAGTGTGTCGATCTCACGGA 3
+TACCGCCTCAGGCAGCTATCTCGGTTA TACCGCCTCAGGCAGCTATCTCGGTCA 3 1
+TAGGCGATCAAGCTACTTCGTACTACG AAGGCGATCAAGCTACTTCGTACTACG 3 1
+TAGGCGATCGCTAACTCAACAACTAGT AAGGCGATCGCTAACTCAACAACTAGT 3 1
+TATATTGGGCCGACAAGAAAGGGTCAG AATATTGGGCCGACAAGAAAGGGTCAG,TCTATTGGGCCGACAAGAAAGGGTCAG 3 1,1
+TATCGAATGAAGAGTAGACAGGCATTT AATCGAATGAAGAGTAGACAGGCATTT 3 1
+TCACACAAAAAATAAATATGTATGCGA ACACACAAAAAATAAATATGTATGCGA,TCGCACAAAAAATAAATATGTATGCGA 3 2,1
+TCACGAGATACTTCGAGCAAGCCTTCT ACACGAGATACTTCGAGCAAGCCTTCT,CCACGAGATACTTCGAGCAAGCCTTCT 3 2,1
+TCACGAGATCAACAACGGCGTACTACG ACACGAGATCAACAACGGCGTACTACG 3 1
+TCAGGAGGAACTTGATGACAAGTAGAA 4
+TCAGGAGGACACTTATGTCTAAGCTTC TCAGGAGGACACTTATGTCTACGCTTC,TCAGGAGGACACTTATGTCTCAGCTTC,TNAGGAGGACACTTATGTCTAAGCTTC 6 1,1,1
+TCAGGAGGACTTGCTTCAATCGAGTCT TCAGGAGGACTTGCTTCCATCGAGTCT 3 1
+TCATAAGCGATCCTAGGATTCAGCTCA ACATAAGCGATCCTAGGATTCAGCTCA 3 1
+TCATAAGCGGACACTTAAAGCCGCAAG ACATAAGCGGACACTTAAAGCCGCAAG,TCATAAGCGGACACTTAAAGCCGCGAG,TTATAAGCGGACACTTAAAGCCGCAAG 3 1,1,1
+TCGAAAGGTATTACCAAGGCAACATTA ACGAAAGGTATTACCAAGGCAACATTA 4 1
+TGAACTTCCACAATATGGCAAGTTTCC AGAACTTCCACAATATGGCAAGTTTCC 7 2
+TGAACTTCCATGAGTTACCGTACTACG AGAACTTCCATGAGTTACCGTACTACG,TGAACTTCCATGAGTTACCTTACTACG 6 2,1
+TGAAGCACTCACTCGAGAACACCTTAG 4
+TGAAGCACTGCCAATGTAGAACGACAA AGAAGCACTGCCAATGTAGAACGACAA 3 2
+TGAGCTATGAAGAGTAGAAAGCCTTCT 3
+TGCGATCTACCGTATCTAACAAACTTT 4
+TGGAGATTAATTAGGCATACAGTAAAC 6
+TGGCTCAGAAAGCTACTTGCTCCTTGA TGACTCAGAAAGCTACTTGCTCCTTGA 3 1
+TGGCTCAGAAGAATGGAGAATTCGGCG TGGTTCAGAAGAATGGAGAATTCGGCG 3 1
+TGGCTCAGATGACAGACAAAGGGAACT TGGCTCAGATGACAGACAAAGGGACCT 3 1
+TGTACCTTAGCCAATGTAAAGGGTCAG 3
+TTACTTAGGATGAGTTACTCCGTCTTA ATACTTAGGATGAGTTACTCCGTCTTA 3 1
+TTGTTCCAAAGGTTCGCTACTCATACG 3
+TTTATTACCCCATGCACACATAGGTCA ATTATTACCCCATGCACACATAGGTCA 3 2
+TTTATTACCGTGCAGTAAACTCATACG ATTATTACCGTGCAGTAAACTCATACG 3 1
+TTTGGCTAAACTATCCTCACTACATAT ATTGGCTAAACTATCCTCACTACATAT 3 2
+TTTTAGATGGATGAACTGCAACGATCT ATTTAGATGGATGAACTGCAACGATCT 3 1
b
diff -r 0c721837cbcf -r 3adbf2fa0928 test-data/testYYY.40k.fastq.gz
b
Binary file test-data/testYYY.40k.fastq.gz has changed
b
diff -r 0c721837cbcf -r 3adbf2fa0928 umi-tools_whitelist.xml
--- a/umi-tools_whitelist.xml Fri Jul 20 03:49:25 2018 -0400
+++ b/umi-tools_whitelist.xml Tue Aug 27 17:11:52 2019 -0400
[
@@ -1,4 +1,4 @@
-<tool id="umi_tools_whitelist" name="UMI-tools whitelist" version="@VERSION@.1">
+<tool id="umi_tools_whitelist" name="UMI-tools whitelist" version="@VERSION@.0">
     <description>Extract cell barcodes from FASTQ files</description>
     <macros>
         <import>macros.xml</import>
@@ -9,6 +9,7 @@
 
         umi_tools whitelist
             --bc-pattern='$bc_pattern'
+            --extract-method='$extract_method'
             --subset-reads='$subset_reads'
             #if $input_type.type == 'single':
                 #if $gz:
@@ -70,13 +71,18 @@
                     represent the random positions and Xs to indicate the bc positions.
                     Bases with Ns will be extracted and added to the read name. Remaining
                     bases, marked with an X will be reattached to the read.">
+            <expand macro="barcode_sanitizer" />
+        </param>
+        <param name="extract_method" argument="--extract-method" type="select" label="Barcode Extraction Method"
+               help="If bracketed expressions are used in the above barcode pattern, then set this to 'regex'. Otherwise leave as 'string'" >
+            <!-- The selected value is passed verbatim to umi_tools as the extract-method argument in the
+                 command section. 'string' is preselected; 'regex' is the mode for patterns that need
+                 the punctuation allowed by the barcode_sanitizer macro. -->
+            <option value="string" selected="true" />
+            <option value="regex" />
         </param>
         <param name="method" argument="--method" type="select" label="Count reads or UMIs"
                help="Many published protocols rank CBs by the number of reads the CBs appear in. However you could also use the number of unique UMIs a CB is associated with. Note that this is still and approximation to the number of transcripts captured because the same UMI could be associated with two different transcripts and be counted as independent." >
             <option value="reads" selected="true" />
             <option value="umis" />
         </param>
-
         <param argument="--3prime" name="prime3" type="boolean" label="Is barcode on 3' end of the read?"
             truevalue="--3prime" falsevalue=""
             help="By default the barcode is assumed to be on the 5' end of the read, but
@@ -91,12 +97,11 @@
             <when value="advanced">
                 <param name="set_cell_number" type="integer" min="0" value="0" label="Specify the number of cell barcodes to accept" />
                 <param name="expect_cells" type="integer" min="0" value="0" label="Prior expectation on the upper limit on the number of cells sequenced" />
-                <param name="error_correct_thresh" type="integer" min="0" value="0" label="Hamming distance for correction of barcodes to whilelist barcodes. Set to zero to generate no error correcting metrics." />
+                <param name="error_correct_thresh" type="integer" min="0" value="0" label="Hamming distance for correction of barcodes to whitelist barcodes. Set to zero to generate no error correcting metrics." />
             </when>
         </conditional>
         <param argument="--log" type="boolean" label="Output log?" truevalue="--log" falsevalue=""
             help="Choose if you want to generate a text file containing logging information." />
-
     </inputs>
     <outputs>
         <data name="out_whitelist" format="tabular" label="${tool.name} on ${on_string}: Whitelist"/>
@@ -108,8 +113,10 @@
     </outputs>
     <tests>
         <test expect_num_outputs="3">
-            <param name="type" value="single" />
-            <param name="input_single" value="t_R2.fastq.gz" ftype="fastq" />
+            <conditional name="input_type" >
+                <param name="type" value="single" />
+                <param name="input_single" value="t_R2.fastq.gz" ftype="fastq.gz" />
+            </conditional>
             <param name="bc_pattern" value="CCCCCCCCNNNNNNNN" />
             <param name="method" value="reads" />
             <param name="prime3" value="true" />
@@ -118,9 +125,11 @@
             <output name="out_html_report" file="out_wl_single.html" />
         </test>
         <test expect_num_outputs="4">
-            <param name="type" value="paired" />
-            <param name="input_read1" value="t_R1.fastq.gz" ftype="fastq" />
-            <param name="input_read2" value="t_R2.fastq.gz" ftype="fastq" />
+            <conditional name="input_type" >
+                <param name="type" value="paired" />
+                <param name="input_read1" value="t_R1.fastq.gz" ftype="fastq.gz" />
+                <param name="input_read2" value="t_R2.fastq.gz" ftype="fastq.gz" />
+            </conditional>
             <param name="barcode_select" value="first_read_only" />
             <param name="bc_pattern" value="CCCNNNNNNNNXXXXX" />
             <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" />
@@ -136,13 +145,15 @@
             <output name="out_thresh" file="out_wl_paired.tresh.tab" />
         </test>
         <test expect_num_outputs="4"> <!-- As previous, identical outputs but paired collection input -->
-            <param name="type" value="paired_collection" />
-            <param name="input_readpair" >
-                <collection type="paired">
-                    <element name="forward" ftype="fastq.gz" value="t_R1.fastq.gz" />
-                    <element name="reverse" ftype="fastq.gz" value="t_R2.fastq.gz" />
-                </collection>
-            </param>
+            <conditional name="input_type" >
+                <param name="type" value="paired_collection" />
+                <param name="input_readpair" >
+                    <collection type="paired">
+                        <element name="forward" ftype="fastq.gz" value="t_R1.fastq.gz" />
+                        <element name="reverse" ftype="fastq.gz" value="t_R2.fastq.gz" />
+                    </collection>
+                </param>
+            </conditional>
             <param name="barcode_select" value="first_read_only" />
             <param name="bc_pattern"  value="CCCNNNNNNNNXXXXX" />
             <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" />
@@ -157,6 +168,20 @@
             <output name="out_html_report" file="out_wl_paired.html" />
             <output name="out_thresh" file="out_wl_paired.tresh.tab" />
         </test>
+        <!-- Regression test for an error report: regex barcode patterns containing the lt and gt symbols were not accepted -->
+        <test expect_num_outputs="3">
+            <conditional name="input_type" >
+                <param name="type" value="single" />
+                <param name="input_single" value="testYYY.40k.fastq.gz" ftype="fastq.gz" />
+            </conditional>
+            <param name="bc_pattern" value="(?P&#60;cell_1&#62;.{8,10})(?P&#60;discard_1&#62;ACTGGCCTGCGA){s&#60;=3}(?P&#60;cell_2&#62;.{9})(?P&#60;discard_2&#62;GGTAGCGGTGACA){s&#60;=3}(?P&#60;cell_3&#62;.{9})(?P&#60;umi_1&#62;.{8})T{3}.*" />
+            <param name="extract_method" value="regex" />
+            <param name="method" value="umis" />
+            <param name="prime3" value="true" />
+            <output name="out_whitelist" file="out_wl_user.single.txt" />
+            <output name="out_thresh" file="out_wl_user.single.tresh.tab" />
+            <output name="out_html_report" file="out_wl_user.single.html" />
+        </test>
     </tests>
     <help><![CDATA[