changeset 10:3adbf2fa0928 draft

"planemo upload commit 28e58376e1d70e38276873a7d5e2ab44db88c2c0"
author iuc
date Tue, 27 Aug 2019 17:11:52 -0400
parents 0c721837cbcf
children b9e787edcbbd
files macros.xml test-data/out_wl_single.txt test-data/out_wl_user.single.html test-data/out_wl_user.single.tresh.tab test-data/out_wl_user.single.txt test-data/testYYY.40k.fastq.gz umi-tools_whitelist.xml
diffstat 7 files changed, 159 insertions(+), 57 deletions(-) [+]
line wrap: on
line diff
--- a/macros.xml	Fri Jul 20 03:49:25 2018 -0400
+++ b/macros.xml	Tue Aug 27 17:11:52 2019 -0400
@@ -1,5 +1,23 @@
 <?xml version="1.0"?>
 <macros>
+    <macro name="barcode_sanitizer" >
+        <sanitizer invalid_char="">
+            <valid initial="string.letters,string.digits">
+                <add value="&#40;" /><!-- left bracket -->
+                <add value="&#41;" /><!-- right bracket -->
+                <add value="&#42;" /><!-- asterisk -->
+                <add value="&#44;" /><!-- comma -->
+                <add value="&#46;" /><!-- period -->
+                <add value="&#60;" /><!-- less than -->
+                <add value="&#61;" /><!-- equals sign -->
+                <add value="&#62;" /><!-- greater than -->
+                <add value="&#63;" /><!-- question mark -->
+                <add value="&#95;" /><!-- underscore -->
+                <add value="&#123;"/><!-- left brace -->
+                <add value="&#125;"/><!-- right brace -->
+            </valid>
+        </sanitizer>
+    </macro>
     <macro name="barcode2_conditional" >
         <conditional name="barcode">
             <param name="barcode_select" argument="--split-barcode" type="select" label="Barcode on both reads?">
@@ -11,6 +29,7 @@
                 <param name="bc_pattern2" argument="--bc-pattern2" type="text" value="" label="Barcode pattern for second read"
                        help="Use this option to specify the format of the UMI/barcode for
                              the second read pair if required." >
+                    <expand macro="barcode_sanitizer" />
                 </param>
             </when>
         </conditional>
@@ -55,7 +74,7 @@
             <yield />
         </requirements>
     </xml>
-    <token name="@VERSION@">0.5.3</token>
+    <token name="@VERSION@">0.5.5</token>
     <token name="@COMMAND_LINK@"><![CDATA[
         #set $gz = False
         #if $input_type.type == 'single':
--- a/test-data/out_wl_single.txt	Fri Jul 20 03:49:25 2018 -0400
+++ b/test-data/out_wl_single.txt	Tue Aug 27 17:11:52 2019 -0400
@@ -1,44 +1,4 @@
-# output generated by whitelist --bc-pattern=CCCCCCCCNNNNNNNN --subset-reads=0 --stdin=/tmp/tmpibtvD6/files/000/dataset_1.dat --method=reads --plot-prefix=OUT --3prime
-# job started at Sun Feb 25 10:49:56 2018 on bag -- cb0db520-8a4e-4040-aa88-93efc0718fa8
-# pid: 2217, system: Linux 4.13.0-32-generic #35-Ubuntu SMP Thu Jan 25 09:13:46 UTC 2018 x86_64
-# blacklist_tsv                           : None
-# cell_number                             : False
-# compresslevel                           : 6
-# error_correct_threshold                 : 1
-# expect_cells                            : False
-# extract_method                          : string
-# filter_cell_barcodes                    : False
-# log2stderr                              : False
-# loglevel                                : 1
-# method                                  : reads
-# pattern                                 : CCCCCCCCNNNNNNNN
-# pattern2                                : None
-# plot_prefix                             : OUT
-# prime3                                  : True
-# random_seed                             : None
-# read2_in                                : None
-# short_help                              : None
-# stderr                                  : <_io.TextIOWrapper name='<stderr>' mode='w' encoding='UTF-8'>
-# stdin                                   : <_io.TextIOWrapper name='/tmp/tmpibtvD6/files/000/dataset_1.dat' mode='r' encoding='UTF-8'>
-# stdlog                                  : <_io.TextIOWrapper name='<stdout>' mode='w' encoding='UTF-8'>
-# stdout                                  : <_io.TextIOWrapper name='<stdout>' mode='w' encoding='UTF-8'>
-# subset_reads                            : 0
-# timeit_file                             : None
-# timeit_header                           : None
-# timeit_name                             : all
-# whitelist_tsv                           : None
-## 2018-02-25 10:49:56,061 INFO Starting barcode extraction
-## 2018-02-25 10:49:56,061 INFO Parsed 0 reads
-## 2018-02-25 10:49:56,062 INFO Starting - whitelist determination
-## 2018-02-25 10:49:57,383 INFO Finished - whitelist determination
-## 2018-02-25 10:49:57,383 INFO Starting - finding putative error cell barcodes
-## 2018-02-25 10:49:57,383 INFO Finished - finding putative error cell barcodes
-## 2018-02-25 10:49:57,383 INFO Writing out whitelist
 AAAAAAAA	AAAAAACA,AAACAAAA,AATAAAAA	3	1,1,1
 ACAAAAAC		2	
 ACAACAAA		2	
 TTACTTAA	TTACTAAA	2	1
-## 2018-02-25 10:49:57,383 INFO Parsed 100 reads
-## 2018-02-25 10:49:57,383 INFO 100 reads matched the barcode pattern
-## 2018-02-25 10:49:57,383 INFO Found 95 unique cell barcodes
-# job finished in 1 seconds at Sun Feb 25 10:49:57 2018 --  2.25  0.06  0.00  0.00 -- cb0db520-8a4e-4040-aa88-93efc0718fa8
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out_wl_user.single.html	Tue Aug 27 17:11:52 2019 -0400
@@ -0,0 +1,1 @@
+<html> <head></head><body> <h1>Cell and Count Metrics</h1> <img src="OUT_cell_barcode_count_density.png" ><br /> <img src="OUT_cell_barcode_knee.png" ><br /> <img src="OUT_cell_barcode_counts.png" ><br /> </body></html>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out_wl_user.single.tresh.tab	Tue Aug 27 17:11:52 2019 -0400
@@ -0,0 +1,7 @@
+count	action
+2	Rejected
+8	Rejected
+11	Rejected
+27	Rejected
+90	Selected
+404	Rejected
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out_wl_user.single.txt	Tue Aug 27 17:11:52 2019 -0400
@@ -0,0 +1,90 @@
+AACCGCCTCAGGCAGCTATCTCGGTTA		6	
+AACCGCCTCCTCTCTCAAACTACATAT	TACCGCCTCCTCTCTCAAACTACATAT	4	1
+AAGACATGCAGTGAAAGGACAAACTTT		3	
+ACACGCCGGTTCCATTGAGCAACATTA		3	
+ACCAAGGACACTGTTAGAAAGTTTACG	ACCAAGGACACTGTAAGAAAGTTTACG	3	1
+ACCCTGACCACAGTGGTACTACATGCG		4	
+ACGAAAGGTCGTTCGGTATACTTCGGA	TCGAAAGGTCGTTCGGTATACTTCGGA	3	1
+AGAGCTATGAAGAGTAGAAAGCCTTCT		3	
+AGCCATCACAGCGACACCTCTCACGGA	TGCCATCACAGCGACACCTCTCACGGA	6	1
+ATCAGAGCTAACCCTCGGAAGGGTCAG	TTCAGAGCTAACCCTCGGAAGGGTCAG	3	1
+ATCCCGGAGGTACATCTACATAGGTCA		3	
+ATGTAATGGTAGTCTTGAACGCTGTTG		3	
+CAAACCGCCACGACCACCAAGTTTACG		3	
+CAAACCGCCTGACAGACAAATGTATCG	CAAACCGTCTGACAGACAAATGTATCG,GAAACCGCCTGACAGACAAATGTATCG	3	1,1
+CAAAGGCACAACCAAAGTACTAGACCG		3	
+CAAAGGCACAACCCTCGGACTCATACG	CAAAGGCACGACCCTCGGACTCATACG,GAAAGGCACAACCCTCGGACTCATACG	3	1,1
+CAAAGGCACAATCAGTTTATGGGACTC		3	
+CAAAGGCACGATGAACTGGCTCCTTGA	GAAAGGCACGATGAACTGGCTCCTTGA	4	1
+CAAAGGCACTAGGGATACCATAGGTCA	GAAAGGCACTAGGGATACCATAGGTCA	4	1
+CAAGCAAGCGATGAACTGCAATACAAG		3	
+CACACACTACACCCAAAGGTAGAAGCA	GACACACTACACCCAAAGGTAGAAGCA	4	1
+CACATTGCAAAACGTACCAAGGCCGCA		3	
+CAGCAAGATTTGGAGGTAAGTCTGTAC		3	
+CAGCTGACATACAGGATATCCGTCTTA		3	
+CCACTTGGAAAACTGCGCAAATGGAGG		3	
+CTATGAAATCTCTCTCAAAAATTACAG	GTATGAAATCTCTCTCAAAAATTACAG	4	1
+CTTCACATAGTACATCTAGATCAGCGA		3	
+GAATCTGTAAAGGACTATAAAGTCATT		4	
+GACGGATTAAGTGTTGTCACTCATACG		3	
+GACGGATTATATAGCCCTCAATAGGGT		3	
+GAGAATCGTCAAGACCTACAGCCTGGC	CAGAATCGTCAAGACCTACAGCCTGGC,GGGAATCGTCAAGACCTACAGCCTGGC	3	1,1
+GAGGTGCTAGCCAATGTAGATAGAGGA		3	
+GAGTACATTACCAAAATGTGAAGCCAA		3	
+GAGTACATTTTGGAGGTAACAACTAGT		3	
+GCAATCCGAAGATAGTTCCAGCCTGGC	GCAATCCGAAGATAGTACCAGCCTGGC	4	1
+GCAATCCGAAGGCAGCTAAATTCGGCG	GCAATCCGAAGGCAGCTGAATTCGGCG	3	1
+GCAATCCGAGAGTGCGAATCCTCAATA	GCAATCCGAGAGAGCGAATCCTCAATA,GCAATCCGAGAGTGCGTATCCTCAATA	5	1,1
+GCAATCCGAGCCAATGTAGATGGTCCA		3	
+GCACTGTCAGATGAACTGCAAGTAGAA		3	
+GCACTGTCAGTGTGTCGACGTCTAGGT		3	
+GCCTTACAAGTGCAGTAACTCAAGACA		3	
+GCGATTACAAAGCTACTTACTTACGAT	GCGATTACAAAGCTACTTCCTTACGAT	4	1
+GCGATTACAATCAACCGAGGCACAACA		3	
+GCGATTACATGTTCTCCACAATAGGGT		3	
+GCTGCCAATAGGAGGCGCTGAAGCCAA		3	
+GCTGCCAATATTCATCGTAGTTGTTCT	GCTGCCAATATTCATCGTAGTTGTTCC	7	1
+GCTGCCAATCAACAACGGTGAAGCCAA	GCTGCCAATCATCAACGGTGAAGCCAA	6	1
+GCTGCCAATCACCTACCCAAGCCTTCT	GATGCCAATCACCTACCCAAGCCTTCT	3	1
+GGATTAGGAAGCTGCCGTGGCACAACA	GGAGTAGGAAGCTGCCGTGGCACAACA	3	1
+GGCAAGCAAACCAAAATGACTTACGAT		4	
+GGCAAGCAAGATGAACTGCTAAGCTTC		4	
+GGCAAGCAATAGTGACTACTACATGCG		3	
+GTACAGAACAATCCTGAACTATTAGCC	CTACAGAACAATCCTGAACTATTAGCC	3	1
+GTATGAAATATCCAACCGCTGTACGGA		3	
+TAAGCGTTACACAAAGGCCACAAGTAT		5	
+TAAGCGTTATATAGCCCTCAACCTCCA		4	
+TACCGAGCAATCCTAGGATCTCACGGA		3	
+TACCGAGCACAAGACCTACGTACTACG		5	
+TACCGAGCAGTGTGTCGATCTCACGGA		3	
+TACCGCCTCAGGCAGCTATCTCGGTTA	TACCGCCTCAGGCAGCTATCTCGGTCA	3	1
+TAGGCGATCAAGCTACTTCGTACTACG	AAGGCGATCAAGCTACTTCGTACTACG	3	1
+TAGGCGATCGCTAACTCAACAACTAGT	AAGGCGATCGCTAACTCAACAACTAGT	3	1
+TATATTGGGCCGACAAGAAAGGGTCAG	AATATTGGGCCGACAAGAAAGGGTCAG,TCTATTGGGCCGACAAGAAAGGGTCAG	3	1,1
+TATCGAATGAAGAGTAGACAGGCATTT	AATCGAATGAAGAGTAGACAGGCATTT	3	1
+TCACACAAAAAATAAATATGTATGCGA	ACACACAAAAAATAAATATGTATGCGA,TCGCACAAAAAATAAATATGTATGCGA	3	2,1
+TCACGAGATACTTCGAGCAAGCCTTCT	ACACGAGATACTTCGAGCAAGCCTTCT,CCACGAGATACTTCGAGCAAGCCTTCT	3	2,1
+TCACGAGATCAACAACGGCGTACTACG	ACACGAGATCAACAACGGCGTACTACG	3	1
+TCAGGAGGAACTTGATGACAAGTAGAA		4	
+TCAGGAGGACACTTATGTCTAAGCTTC	TCAGGAGGACACTTATGTCTACGCTTC,TCAGGAGGACACTTATGTCTCAGCTTC,TNAGGAGGACACTTATGTCTAAGCTTC	6	1,1,1
+TCAGGAGGACTTGCTTCAATCGAGTCT	TCAGGAGGACTTGCTTCCATCGAGTCT	3	1
+TCATAAGCGATCCTAGGATTCAGCTCA	ACATAAGCGATCCTAGGATTCAGCTCA	3	1
+TCATAAGCGGACACTTAAAGCCGCAAG	ACATAAGCGGACACTTAAAGCCGCAAG,TCATAAGCGGACACTTAAAGCCGCGAG,TTATAAGCGGACACTTAAAGCCGCAAG	3	1,1,1
+TCGAAAGGTATTACCAAGGCAACATTA	ACGAAAGGTATTACCAAGGCAACATTA	4	1
+TGAACTTCCACAATATGGCAAGTTTCC	AGAACTTCCACAATATGGCAAGTTTCC	7	2
+TGAACTTCCATGAGTTACCGTACTACG	AGAACTTCCATGAGTTACCGTACTACG,TGAACTTCCATGAGTTACCTTACTACG	6	2,1
+TGAAGCACTCACTCGAGAACACCTTAG		4	
+TGAAGCACTGCCAATGTAGAACGACAA	AGAAGCACTGCCAATGTAGAACGACAA	3	2
+TGAGCTATGAAGAGTAGAAAGCCTTCT		3	
+TGCGATCTACCGTATCTAACAAACTTT		4	
+TGGAGATTAATTAGGCATACAGTAAAC		6	
+TGGCTCAGAAAGCTACTTGCTCCTTGA	TGACTCAGAAAGCTACTTGCTCCTTGA	3	1
+TGGCTCAGAAGAATGGAGAATTCGGCG	TGGTTCAGAAGAATGGAGAATTCGGCG	3	1
+TGGCTCAGATGACAGACAAAGGGAACT	TGGCTCAGATGACAGACAAAGGGACCT	3	1
+TGTACCTTAGCCAATGTAAAGGGTCAG		3	
+TTACTTAGGATGAGTTACTCCGTCTTA	ATACTTAGGATGAGTTACTCCGTCTTA	3	1
+TTGTTCCAAAGGTTCGCTACTCATACG		3	
+TTTATTACCCCATGCACACATAGGTCA	ATTATTACCCCATGCACACATAGGTCA	3	2
+TTTATTACCGTGCAGTAAACTCATACG	ATTATTACCGTGCAGTAAACTCATACG	3	1
+TTTGGCTAAACTATCCTCACTACATAT	ATTGGCTAAACTATCCTCACTACATAT	3	2
+TTTTAGATGGATGAACTGCAACGATCT	ATTTAGATGGATGAACTGCAACGATCT	3	1
Binary file test-data/testYYY.40k.fastq.gz has changed
--- a/umi-tools_whitelist.xml	Fri Jul 20 03:49:25 2018 -0400
+++ b/umi-tools_whitelist.xml	Tue Aug 27 17:11:52 2019 -0400
@@ -1,4 +1,4 @@
-<tool id="umi_tools_whitelist" name="UMI-tools whitelist" version="@VERSION@.1">
+<tool id="umi_tools_whitelist" name="UMI-tools whitelist" version="@VERSION@.0">
     <description>Extract cell barcodes from FASTQ files</description>
     <macros>
         <import>macros.xml</import>
@@ -9,6 +9,7 @@
 
         umi_tools whitelist
             --bc-pattern='$bc_pattern'
+            --extract-method='$extract_method'
             --subset-reads='$subset_reads'
             #if $input_type.type == 'single':
                 #if $gz:
@@ -70,13 +71,18 @@
                     represent the random positions and Xs to indicate the bc positions.
                     Bases with Ns will be extracted and added to the read name. Remaining
                     bases, marked with an X will be reattached to the read.">
+            <expand macro="barcode_sanitizer" />
+        </param>
+        <param name="extract_method" argument="--extract-method" type="select" label="Barcode Extraction Method"
+               help="If the barcode pattern above uses bracketed (regular expression) groups, set this to 'regex'; otherwise leave it as 'string'." >
+            <option value="string" selected="true" />
+            <option value="regex" />
         </param>
         <param name="method" argument="--method" type="select" label="Count reads or UMIs"
                help="Many published protocols rank CBs by the number of reads the CBs appear in. However you could also use the number of unique UMIs a CB is associated with. Note that this is still and approximation to the number of transcripts captured because the same UMI could be associated with two different transcripts and be counted as independent." >
             <option value="reads" selected="true" />
             <option value="umis" />
         </param>
-
         <param argument="--3prime" name="prime3" type="boolean" label="Is barcode on 3' end of the read?"
             truevalue="--3prime" falsevalue=""
             help="By default the barcode is assumed to be on the 5' end of the read, but
@@ -91,12 +97,11 @@
             <when value="advanced">
                 <param name="set_cell_number" type="integer" min="0" value="0" label="Specify the number of cell barcodes to accept" />
                 <param name="expect_cells" type="integer" min="0" value="0" label="Prior expectation on the upper limit on the number of cells sequenced" />
-                <param name="error_correct_thresh" type="integer" min="0" value="0" label="Hamming distance for correction of barcodes to whilelist barcodes. Set to zero to generate no error correcting metrics." />
+                <param name="error_correct_thresh" type="integer" min="0" value="0" label="Hamming distance for correction of barcodes to whitelist barcodes. Set to zero to generate no error correcting metrics." />
             </when>
         </conditional>
         <param argument="--log" type="boolean" label="Output log?" truevalue="--log" falsevalue=""
             help="Choose if you want to generate a text file containing logging information." />
-
     </inputs>
     <outputs>
         <data name="out_whitelist" format="tabular" label="${tool.name} on ${on_string}: Whitelist"/>
@@ -108,8 +113,10 @@
     </outputs>
     <tests>
         <test expect_num_outputs="3">
-            <param name="type" value="single" />
-            <param name="input_single" value="t_R2.fastq.gz" ftype="fastq" />
+            <conditional name="input_type" >
+                <param name="type" value="single" />
+                <param name="input_single" value="t_R2.fastq.gz" ftype="fastq.gz" />
+            </conditional>
             <param name="bc_pattern" value="CCCCCCCCNNNNNNNN" />
             <param name="method" value="reads" />
             <param name="prime3" value="true" />
@@ -118,9 +125,11 @@
             <output name="out_html_report" file="out_wl_single.html" />
         </test>
         <test expect_num_outputs="4">
-            <param name="type" value="paired" />
-            <param name="input_read1" value="t_R1.fastq.gz" ftype="fastq" />
-            <param name="input_read2" value="t_R2.fastq.gz" ftype="fastq" />
+            <conditional name="input_type" >
+                <param name="type" value="paired" />
+                <param name="input_read1" value="t_R1.fastq.gz" ftype="fastq.gz" />
+                <param name="input_read2" value="t_R2.fastq.gz" ftype="fastq.gz" />
+            </conditional>
             <param name="barcode_select" value="first_read_only" />
             <param name="bc_pattern" value="CCCNNNNNNNNXXXXX" />
             <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" />
@@ -136,13 +145,15 @@
             <output name="out_thresh" file="out_wl_paired.tresh.tab" />
         </test>
         <test expect_num_outputs="4"> <!-- As previous, identical outputs but paired collection input -->
-            <param name="type" value="paired_collection" />
-            <param name="input_readpair" >
-                <collection type="paired">
-                    <element name="forward" ftype="fastq.gz" value="t_R1.fastq.gz" />
-                    <element name="reverse" ftype="fastq.gz" value="t_R2.fastq.gz" />
-                </collection>
-            </param>
+            <conditional name="input_type" >
+                <param name="type" value="paired_collection" />
+                <param name="input_readpair" >
+                    <collection type="paired">
+                        <element name="forward" ftype="fastq.gz" value="t_R1.fastq.gz" />
+                        <element name="reverse" ftype="fastq.gz" value="t_R2.fastq.gz" />
+                    </collection>
+                </param>
+            </conditional>
             <param name="barcode_select" value="first_read_only" />
             <param name="bc_pattern"  value="CCCNNNNNNNNXXXXX" />
             <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" />
@@ -157,6 +168,20 @@
             <output name="out_html_report" file="out_wl_paired.html" />
             <output name="out_thresh" file="out_wl_paired.tresh.tab" />
         </test>
+        <!-- Regression test for an error report: regex barcode patterns containing less-than and greater-than symbols were not accepted -->
+        <test expect_num_outputs="3">
+            <conditional name="input_type" >
+                <param name="type" value="single" />
+                <param name="input_single" value="testYYY.40k.fastq.gz" ftype="fastq.gz" />
+            </conditional>
+            <param name="bc_pattern" value="(?P&#60;cell_1&#62;.{8,10})(?P&#60;discard_1&#62;ACTGGCCTGCGA){s&#60;=3}(?P&#60;cell_2&#62;.{9})(?P&#60;discard_2&#62;GGTAGCGGTGACA){s&#60;=3}(?P&#60;cell_3&#62;.{9})(?P&#60;umi_1&#62;.{8})T{3}.*" />
+            <param name="extract_method" value="regex" />
+            <param name="method" value="umis" />
+            <param name="prime3" value="true" />
+            <output name="out_whitelist" file="out_wl_user.single.txt" />
+            <output name="out_thresh" file="out_wl_user.single.tresh.tab" />
+            <output name="out_html_report" file="out_wl_user.single.html" />
+        </test>
     </tests>
     <help><![CDATA[