# HG changeset patch
# User iuc
# Date 1631544691 0
# Node ID 4098ab380097dc8fc3315b7ab9f9b9d589d7ac44
# Parent 7fa28eb10fedc5811fb4aa382ee89fbb0b3d30e9
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit bf6a3aa532e8f9d122da4c1e39f3e256ae587b79"
diff -r 7fa28eb10fed -r 4098ab380097 macros.xml
--- a/macros.xml Wed Feb 10 19:30:35 2021 +0000
+++ b/macros.xml Mon Sep 13 14:51:31 2021 +0000
@@ -1,5 +1,43 @@
+
+
+
+ 1.1.2
+ 0
+ 21.01
+
+
+ umi_tools
+
+
+
+
+
+ 10.1101/gr.209601.116
+
+ @misc{githubUMI-tools,
+ title = {UMI-tools},
+ publisher = {GitHub},
+ journal = {GitHub repository},
+ url = {https://github.com/CGATOxford/UMI-tools},
+ }
+
+
+
+
+
+
+
+
+
+
+
+
@@ -23,90 +61,510 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ fastqsanger,fastqsanger.gz,fastqillumina,fastqillumina.gz,fastqsolexa,fastqsolexa.gz
+
+
+ umi-tools
+
+
-
-
+
+
-
+
+
-
-
-
+
+
+
+
+
-
-
+
+
+
+
-
-
- 10.1101/gr.209601.116
-
- @misc{githubUMI-tools,
- title = {UMI-tools},
- publisher = {GitHub},
- journal = {GitHub repository},
- url = {https://github.com/CGATOxford/UMI-tools},
- }
-
-
-
-
-
- umi_tools
-
-
-
- 0.5.5
+
+
+
+ 'input.bam' &&
+ samtools index -b 'input.bam' &&
+ #set $input_file = 'input.bam'
+ #else:
+ ln -sf '${input}' 'input.bam' &&
+ ln -sf '$input.metadata.bam_index' 'input.bam.bai' &&
+ #set $input_file = 'input.bam'
+ #end if
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ .{8,12})(?PGAGTGATTGCTTGTGACGCCTT)(?P.{8})(?P.{6})T{3}.*
+
+ Where only reads with a 3' T-tail and `GAGTGATTGCTTGTGACGCCTT` in
+ the correct position to yield two cell barcodes of 8-12 and 8bp
+ respectively, and a 6bp UMI will be retained.
+
+ You can also specify fuzzy matching to allow errors. For example if
+ the discard group above was specified as below this would enable
+ matches with up to 2 errors in the discard_1 group.
+
+ ::
+
+ (?PGAGTGATTGCTTGTGACGCCTT){s<=2}
+
+ Note that all UMIs must be the same length for downstream
+ processing with dedup, group or count commands]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ``,
+replacing with e.g ":".
+
+Alternatively, if your UMIs are encoded in a tag, you can specify this
+by setting the option --extract-umi-method=tag and set the tag name
+with the --umi-tag option. For example, if your UMIs are encoded in
+the 'UM' tag, provide the following options:
+``--extract-umi-method=tag`` ``--umi-tag=UM``
+
+Finally, if you have used umis to extract the UMI +/- cell barcode,
+you can specify ``--extract-umi-method=umis``
+
+The start position of a read is considered to be the start of its alignment
+minus any soft clipped bases. A read aligned at position 500 with
+cigar 2S98M will be assumed to start at position 498.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ = (2* umi B counts) - 1. Each
+ network is a read group.
+
+]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ log
+
+
+
+
diff -r 7fa28eb10fed -r 4098ab380097 test-data/chr19_gene_tags.sam
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/chr19_gene_tags.sam Mon Sep 13 14:51:31 2021 +0000
@@ -0,0 +1,1492 @@
+@HD VN:1.4 SO:queryname
+@SQ SN:chr1 LN:248956422
+@SQ SN:chr2 LN:242193529
+@SQ SN:chr3 LN:198295559
+@SQ SN:chr4 LN:190214555
+@SQ SN:chr5 LN:181538259
+@SQ SN:chr6 LN:170805979
+@SQ SN:chr7 LN:159345973
+@SQ SN:chr8 LN:145138636
+@SQ SN:chr9 LN:138394717
+@SQ SN:chr10 LN:133797422
+@SQ SN:chr11 LN:135086622
+@SQ SN:chr12 LN:133275309
+@SQ SN:chr13 LN:114364328
+@SQ SN:chr14 LN:107043718
+@SQ SN:chr15 LN:101991189
+@SQ SN:chr16 LN:90338345
+@SQ SN:chr17 LN:83257441
+@SQ SN:chr18 LN:80373285
+@SQ SN:chr19 LN:58617616
+@SQ SN:chr20 LN:64444167
+@SQ SN:chr21 LN:46709983
+@SQ SN:chr22 LN:50818468
+@SQ SN:chrX LN:156040895
+@SQ SN:chrY LN:57227415
+@SQ SN:chrM LN:16569
+@SQ SN:GL000008.2 LN:209709
+@SQ SN:GL000009.2 LN:201709
+@SQ SN:GL000194.1 LN:191469
+@SQ SN:GL000195.1 LN:182896
+@SQ SN:GL000205.2 LN:185591
+@SQ SN:GL000208.1 LN:92689
+@SQ SN:GL000213.1 LN:164239
+@SQ SN:GL000214.1 LN:137718
+@SQ SN:GL000216.2 LN:176608
+@SQ SN:GL000218.1 LN:161147
+@SQ SN:GL000219.1 LN:179198
+@SQ SN:GL000220.1 LN:161802
+@SQ SN:GL000221.1 LN:155397
+@SQ SN:GL000224.1 LN:179693
+@SQ SN:GL000225.1 LN:211173
+@SQ SN:GL000226.1 LN:15008
+@SQ SN:KI270302.1 LN:2274
+@SQ SN:KI270303.1 LN:1942
+@SQ SN:KI270304.1 LN:2165
+@SQ SN:KI270305.1 LN:1472
+@SQ SN:KI270310.1 LN:1201
+@SQ SN:KI270311.1 LN:12399
+@SQ SN:KI270312.1 LN:998
+@SQ SN:KI270315.1 LN:2276
+@SQ SN:KI270316.1 LN:1444
+@SQ SN:KI270317.1 LN:37690
+@SQ SN:KI270320.1 LN:4416
+@SQ SN:KI270322.1 LN:21476
+@SQ SN:KI270329.1 LN:1040
+@SQ SN:KI270330.1 LN:1652
+@SQ SN:KI270333.1 LN:2699
+@SQ SN:KI270334.1 LN:1368
+@SQ SN:KI270335.1 LN:1048
+@SQ SN:KI270336.1 LN:1026
+@SQ SN:KI270337.1 LN:1121
+@SQ SN:KI270338.1 LN:1428
+@SQ SN:KI270340.1 LN:1428
+@SQ SN:KI270362.1 LN:3530
+@SQ SN:KI270363.1 LN:1803
+@SQ SN:KI270364.1 LN:2855
+@SQ SN:KI270366.1 LN:8320
+@SQ SN:KI270371.1 LN:2805
+@SQ SN:KI270372.1 LN:1650
+@SQ SN:KI270373.1 LN:1451
+@SQ SN:KI270374.1 LN:2656
+@SQ SN:KI270375.1 LN:2378
+@SQ SN:KI270376.1 LN:1136
+@SQ SN:KI270378.1 LN:1048
+@SQ SN:KI270379.1 LN:1045
+@SQ SN:KI270381.1 LN:1930
+@SQ SN:KI270382.1 LN:4215
+@SQ SN:KI270383.1 LN:1750
+@SQ SN:KI270384.1 LN:1658
+@SQ SN:KI270385.1 LN:990
+@SQ SN:KI270386.1 LN:1788
+@SQ SN:KI270387.1 LN:1537
+@SQ SN:KI270388.1 LN:1216
+@SQ SN:KI270389.1 LN:1298
+@SQ SN:KI270390.1 LN:2387
+@SQ SN:KI270391.1 LN:1484
+@SQ SN:KI270392.1 LN:971
+@SQ SN:KI270393.1 LN:1308
+@SQ SN:KI270394.1 LN:970
+@SQ SN:KI270395.1 LN:1143
+@SQ SN:KI270396.1 LN:1880
+@SQ SN:KI270411.1 LN:2646
+@SQ SN:KI270412.1 LN:1179
+@SQ SN:KI270414.1 LN:2489
+@SQ SN:KI270417.1 LN:2043
+@SQ SN:KI270418.1 LN:2145
+@SQ SN:KI270419.1 LN:1029
+@SQ SN:KI270420.1 LN:2321
+@SQ SN:KI270422.1 LN:1445
+@SQ SN:KI270423.1 LN:981
+@SQ SN:KI270424.1 LN:2140
+@SQ SN:KI270425.1 LN:1884
+@SQ SN:KI270429.1 LN:1361
+@SQ SN:KI270435.1 LN:92983
+@SQ SN:KI270438.1 LN:112505
+@SQ SN:KI270442.1 LN:392061
+@SQ SN:KI270448.1 LN:7992
+@SQ SN:KI270465.1 LN:1774
+@SQ SN:KI270466.1 LN:1233
+@SQ SN:KI270467.1 LN:3920
+@SQ SN:KI270468.1 LN:4055
+@SQ SN:KI270507.1 LN:5353
+@SQ SN:KI270508.1 LN:1951
+@SQ SN:KI270509.1 LN:2318
+@SQ SN:KI270510.1 LN:2415
+@SQ SN:KI270511.1 LN:8127
+@SQ SN:KI270512.1 LN:22689
+@SQ SN:KI270515.1 LN:6361
+@SQ SN:KI270516.1 LN:1300
+@SQ SN:KI270517.1 LN:3253
+@SQ SN:KI270518.1 LN:2186
+@SQ SN:KI270519.1 LN:138126
+@SQ SN:KI270521.1 LN:7642
+@SQ SN:KI270522.1 LN:5674
+@SQ SN:KI270528.1 LN:2983
+@SQ SN:KI270529.1 LN:1899
+@SQ SN:KI270530.1 LN:2168
+@SQ SN:KI270538.1 LN:91309
+@SQ SN:KI270539.1 LN:993
+@SQ SN:KI270544.1 LN:1202
+@SQ SN:KI270548.1 LN:1599
+@SQ SN:KI270579.1 LN:31033
+@SQ SN:KI270580.1 LN:1553
+@SQ SN:KI270581.1 LN:7046
+@SQ SN:KI270582.1 LN:6504
+@SQ SN:KI270583.1 LN:1400
+@SQ SN:KI270584.1 LN:4513
+@SQ SN:KI270587.1 LN:2969
+@SQ SN:KI270588.1 LN:6158
+@SQ SN:KI270589.1 LN:44474
+@SQ SN:KI270590.1 LN:4685
+@SQ SN:KI270591.1 LN:5796
+@SQ SN:KI270593.1 LN:3041
+@SQ SN:KI270706.1 LN:175055
+@SQ SN:KI270707.1 LN:32032
+@SQ SN:KI270708.1 LN:127682
+@SQ SN:KI270709.1 LN:66860
+@SQ SN:KI270710.1 LN:40176
+@SQ SN:KI270711.1 LN:42210
+@SQ SN:KI270712.1 LN:176043
+@SQ SN:KI270713.1 LN:40745
+@SQ SN:KI270714.1 LN:41717
+@SQ SN:KI270715.1 LN:161471
+@SQ SN:KI270716.1 LN:153799
+@SQ SN:KI270717.1 LN:40062
+@SQ SN:KI270718.1 LN:38054
+@SQ SN:KI270719.1 LN:176845
+@SQ SN:KI270720.1 LN:39050
+@SQ SN:KI270721.1 LN:100316
+@SQ SN:KI270722.1 LN:194050
+@SQ SN:KI270723.1 LN:38115
+@SQ SN:KI270724.1 LN:39555
+@SQ SN:KI270725.1 LN:172810
+@SQ SN:KI270726.1 LN:43739
+@SQ SN:KI270727.1 LN:448248
+@SQ SN:KI270728.1 LN:1872759
+@SQ SN:KI270729.1 LN:280839
+@SQ SN:KI270730.1 LN:112551
+@SQ SN:KI270731.1 LN:150754
+@SQ SN:KI270732.1 LN:41543
+@SQ SN:KI270733.1 LN:179772
+@SQ SN:KI270734.1 LN:165050
+@SQ SN:KI270735.1 LN:42811
+@SQ SN:KI270736.1 LN:181920
+@SQ SN:KI270737.1 LN:103838
+@SQ SN:KI270738.1 LN:99375
+@SQ SN:KI270739.1 LN:73985
+@SQ SN:KI270740.1 LN:37240
+@SQ SN:KI270741.1 LN:157432
+@SQ SN:KI270742.1 LN:186739
+@SQ SN:KI270743.1 LN:210658
+@SQ SN:KI270744.1 LN:168472
+@SQ SN:KI270745.1 LN:41891
+@SQ SN:KI270746.1 LN:66486
+@SQ SN:KI270747.1 LN:198735
+@SQ SN:KI270748.1 LN:93321
+@SQ SN:KI270749.1 LN:158759
+@SQ SN:KI270750.1 LN:148850
+@SQ SN:KI270751.1 LN:150742
+@SQ SN:KI270752.1 LN:27745
+@SQ SN:KI270753.1 LN:62944
+@SQ SN:KI270754.1 LN:40191
+@SQ SN:KI270755.1 LN:36723
+@SQ SN:KI270756.1 LN:79590
+@SQ SN:KI270757.1 LN:71251
+@SQ SN:ERCC-00002 LN:1061
+@SQ SN:ERCC-00003 LN:1023
+@SQ SN:ERCC-00004 LN:523
+@SQ SN:ERCC-00009 LN:984
+@SQ SN:ERCC-00012 LN:994
+@SQ SN:ERCC-00013 LN:808
+@SQ SN:ERCC-00014 LN:1957
+@SQ SN:ERCC-00016 LN:844
+@SQ SN:ERCC-00017 LN:1136
+@SQ SN:ERCC-00019 LN:644
+@SQ SN:ERCC-00022 LN:751
+@SQ SN:ERCC-00024 LN:536
+@SQ SN:ERCC-00025 LN:1994
+@SQ SN:ERCC-00028 LN:1130
+@SQ SN:ERCC-00031 LN:1138
+@SQ SN:ERCC-00033 LN:2022
+@SQ SN:ERCC-00034 LN:1019
+@SQ SN:ERCC-00035 LN:1130
+@SQ SN:ERCC-00039 LN:740
+@SQ SN:ERCC-00040 LN:744
+@SQ SN:ERCC-00041 LN:1122
+@SQ SN:ERCC-00042 LN:1023
+@SQ SN:ERCC-00043 LN:1023
+@SQ SN:ERCC-00044 LN:1156
+@SQ SN:ERCC-00046 LN:522
+@SQ SN:ERCC-00048 LN:992
+@SQ SN:ERCC-00051 LN:274
+@SQ SN:ERCC-00053 LN:1023
+@SQ SN:ERCC-00054 LN:274
+@SQ SN:ERCC-00057 LN:1021
+@SQ SN:ERCC-00058 LN:1136
+@SQ SN:ERCC-00059 LN:525
+@SQ SN:ERCC-00060 LN:523
+@SQ SN:ERCC-00061 LN:1136
+@SQ SN:ERCC-00062 LN:1023
+@SQ SN:ERCC-00067 LN:644
+@SQ SN:ERCC-00069 LN:1137
+@SQ SN:ERCC-00071 LN:642
+@SQ SN:ERCC-00073 LN:603
+@SQ SN:ERCC-00074 LN:522
+@SQ SN:ERCC-00075 LN:1023
+@SQ SN:ERCC-00076 LN:642
+@SQ SN:ERCC-00077 LN:273
+@SQ SN:ERCC-00078 LN:993
+@SQ SN:ERCC-00079 LN:644
+@SQ SN:ERCC-00081 LN:534
+@SQ SN:ERCC-00083 LN:1022
+@SQ SN:ERCC-00084 LN:994
+@SQ SN:ERCC-00085 LN:844
+@SQ SN:ERCC-00086 LN:1020
+@SQ SN:ERCC-00092 LN:1124
+@SQ SN:ERCC-00095 LN:521
+@SQ SN:ERCC-00096 LN:1107
+@SQ SN:ERCC-00097 LN:523
+@SQ SN:ERCC-00098 LN:1143
+@SQ SN:ERCC-00099 LN:1350
+@SQ SN:ERCC-00104 LN:2022
+@SQ SN:ERCC-00108 LN:1022
+@SQ SN:ERCC-00109 LN:536
+@SQ SN:ERCC-00111 LN:994
+@SQ SN:ERCC-00112 LN:1136
+@SQ SN:ERCC-00113 LN:840
+@SQ SN:ERCC-00116 LN:1991
+@SQ SN:ERCC-00117 LN:1136
+@SQ SN:ERCC-00120 LN:536
+@SQ SN:ERCC-00123 LN:1022
+@SQ SN:ERCC-00126 LN:1118
+@SQ SN:ERCC-00130 LN:1059
+@SQ SN:ERCC-00131 LN:771
+@SQ SN:ERCC-00134 LN:274
+@SQ SN:ERCC-00136 LN:1033
+@SQ SN:ERCC-00137 LN:537
+@SQ SN:ERCC-00138 LN:1024
+@SQ SN:ERCC-00142 LN:493
+@SQ SN:ERCC-00143 LN:784
+@SQ SN:ERCC-00144 LN:538
+@SQ SN:ERCC-00145 LN:1042
+@SQ SN:ERCC-00147 LN:1023
+@SQ SN:ERCC-00148 LN:494
+@SQ SN:ERCC-00150 LN:743
+@SQ SN:ERCC-00154 LN:537
+@SQ SN:ERCC-00156 LN:494
+@SQ SN:ERCC-00157 LN:1019
+@SQ SN:ERCC-00158 LN:1027
+@SQ SN:ERCC-00160 LN:743
+@SQ SN:ERCC-00162 LN:523
+@SQ SN:ERCC-00163 LN:543
+@SQ SN:ERCC-00164 LN:1022
+@SQ SN:ERCC-00165 LN:872
+@SQ SN:ERCC-00168 LN:1024
+@SQ SN:ERCC-00170 LN:1023
+@SQ SN:ERCC-00171 LN:505
+@PG ID:STAR PN:STAR VN:STAR_2.5.2b CL:STAR --runThreadN 8 --genomeDir /data/home/mvanloenhout/Gencode_v25/Star_overhang69/ --readFilesIn /data/home/mvanloenhout/WTF2/scRNA_Analysis/Processed_data/HSC2-I02_S5_R2_001.fastq.gz --readFilesCommand gunzip -c --outFileNamePrefix /data/home/mvanloenhout/WTF2/scRNA_Analysis/Aligned_files/HSC2-I02_S5_R2_001 --outSAMtype BAM SortedByCoordinate --outSAMmultNmax 1 --outFilterType BySJout --outFilterMultimapNmax 20
+@CO user command line: STAR --runThreadN 8 --genomeDir /data/home/mvanloenhout/Gencode_v25/Star_overhang69/ --outSAMtype BAM SortedByCoordinate --outSAMmultNmax 1 --outFilterMultimapNmax 20 --outFilterType BySJout --outFileNamePrefix /data/home/mvanloenhout/WTF2/scRNA_Analysis/Aligned_files/HSC2-I02_S5_R2_001 --readFilesCommand gunzip -c --readFilesIn /data/home/mvanloenhout/WTF2/scRNA_Analysis/Processed_data/HSC2-I02_S5_R2_001.fastq.gz
+NS500668:144:H5FCJBGXY:1:11102:10920:18759:CELL_TTCACG:UMI_TTGGGA:SAMPLE_CGATGT:UID_CGATGTTTCACGTTGGGA 0 chr19 812244 255 51M9S * 0 0 CGCTGTGGACTCTGTAGAGGCAGGTTGGCCAGTCTGTACCTGGACTTCGAANNNNNNNNN AAAA/A//EE/AA/EEEA//EE////' mode='w' encoding='UTF-8'>
-# stdin : <_io.TextIOWrapper name='/tmp/tmpibtvD6/files/000/dataset_5.dat' mode='r' encoding='UTF-8'>
-# stdlog : <_io.TextIOWrapper name='/tmp/tmpibtvD6/files/000/dataset_8.dat' mode='a' encoding='UTF-8'>
-# stdout : <_io.TextIOWrapper name='' mode='w' encoding='UTF-8'>
+# stderr : <_io.TextIOWrapper name='' mode='w' encoding='utf-8'>
+# stdin : <_io.TextIOWrapper name='input_read1.gz' encoding='ascii'>
+# stdlog : <_io.TextIOWrapper name='/tmp/tmpcx2d26we/files/0/0/8/dataset_008b1843-bfa2-44fb-9d3c-52695bd9ce74.dat' mode='a' encoding='UTF-8'>
+# stdout : <_io.TextIOWrapper name='' mode='w' encoding='utf-8'>
# subset_reads : 0
# timeit_file : None
# timeit_header : None
# timeit_name : all
+# tmpdir : None
# whitelist_tsv : None
-2018-02-25 10:50:16,016 INFO Starting barcode extraction
-2018-02-25 10:50:16,017 INFO Parsed 0 reads
-2018-02-25 10:50:16,019 INFO Starting - whitelist determination
-2018-02-25 10:50:17,208 INFO Finished - whitelist determination
-2018-02-25 10:50:17,208 INFO Starting - finding putative error cell barcodes
-2018-02-25 10:50:17,208 INFO Finished - finding putative error cell barcodes
-2018-02-25 10:50:17,208 INFO Writing out whitelist
-2018-02-25 10:50:17,208 INFO Parsed 100 reads
-2018-02-25 10:50:17,208 INFO 100 reads matched the barcode pattern
-2018-02-25 10:50:17,208 INFO Found 23 unique cell barcodes
-# job finished in 1 seconds at Sun Feb 25 10:50:17 2018 -- 2.35 0.08 0.00 0.00 -- e78e4e5b-e99e-426a-8a92-c8b3beeadf18
+2021-07-13 15:21:12,587 INFO Starting barcode extraction
+2021-07-13 15:21:12,588 INFO Parsed 0 reads
+2021-07-13 15:21:12,590 INFO Starting - whitelist determination
+2021-07-13 15:21:14,249 INFO Finished - whitelist determination
+2021-07-13 15:21:14,249 INFO Starting - finding putative error cell barcodes
+2021-07-13 15:21:14,249 INFO building bktree
+2021-07-13 15:21:14,249 INFO done building bktree
+2021-07-13 15:21:14,249 INFO Finished - finding putative error cell barcodes
+2021-07-13 15:21:14,249 INFO Top 1 cell barcodes passed the selected threshold
+2021-07-13 15:21:14,249 INFO Writing out whitelist
+2021-07-13 15:21:14,249 INFO Parsed 100 reads
+2021-07-13 15:21:14,249 INFO 100 reads matched the barcode pattern
+2021-07-13 15:21:14,249 INFO Found 23 unique cell barcodes
+2021-07-13 15:21:14,249 INFO Found 15 total reads matching the selected cell barcodes
+2021-07-13 15:21:14,249 INFO Found 85 total reads which can be error corrected to the selected cell barcodes
+# job finished in 1 seconds at Tue Jul 13 15:21:14 2021 -- 7.19 0.62 0.08 0.02 -- ba3841c0-b2d5-4188-88ca-4ee241163293
diff -r 7fa28eb10fed -r 4098ab380097 umi-tools_dedup.xml
--- a/umi-tools_dedup.xml Wed Feb 10 19:30:35 2021 +0000
+++ b/umi-tools_dedup.xml Mon Sep 13 14:51:31 2021 +0000
@@ -1,128 +1,157 @@
-
+Extract UMI from fastq files
+ macros.xml
- samtools
+ samtools 0:
- --gene-tag '$gene_tag'
- #end if
- #if $input.is_of_type("sam"):
- --in-sam
- #end if
- -I '$input_file' -S deduped.bam &&
- samtools sort deduped.bam -@ \${GALAXY_SLOTS:-1} -T "\${TMPDIR:-.}" -o '$output' -O BAM
+ @GROUPDEDUP_OPTIONS@
+ @BARCODE_OPTIONS@
+ @UMI_GROUPING_OPTIONS@
+ @SAMBAM_OPTIONS@
+ @FULLSC_OPTIONS@
+ @ADVANCED_OPTIONS@
+ -I '$input_file' -S deduped.bam
+ ## TODO: using samtools sort is a workaround for the following error that appears when Galaxy
+ ## compares the generated file with the one in test-data
+ ## `Converting history BAM to SAM failed: 'samtools returned with error 1: stdout=None, stderr=[main_samview] fail to read the header from "/tmp/tmpd8o61jykdedup_out6.bam".\n'. Will compare BAM files`
+ ## problem seems to be the BAM file generated with pysam
+ ## may be dropped in the future
+ --no-sort-output
+ @LOG@
+ && samtools sort --no-PG deduped.bam -@ \${GALAXY_SLOTS:-1} -T "\${TMPDIR:-.}" -o '$output' -O BAM
+
]]>
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
-
+ output_stats_bool
+
-
-
-
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
+
+
+
+
+
+
+
+
+
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
+
+
+
+
+
+
+
+
+
-
+
@@ -131,183 +160,100 @@
",
-replacing with e.g ":".
+@BARCODE_HELP@
-Alternatively, if your UMIs are encoded in a tag, you can specify this
-by setting the option --extract-umi-method=tag and set the tag name
-with the --umi-tag option. For example, if your UMIs are encoded in
-the 'UM' tag, provide the following options:
-"--extract-umi-method=tag --umi-tag=UM"
+@UMI_GROUPING_HELP@
-The start postion of a read is considered to be the start of its alignment
-minus any soft clipped bases. A read aligned at position 500 with
-cigar 2S98M will be assumed to start at postion 498.
-
-
-Methods
--------
+Selecting the representative read
+---------------------------------
+For every group of duplicate reads, a single representative read is
+retained. The following criteria are applied to select the read that
+will be retained from a group of duplicated reads:
-dedup can be run with multiple methods to identify groups of reads with
-the same (or similar) UMI(s). All methods start by identifying the
-reads with the same mapping position.
+1. The read with the lowest number of mapping coordinates (see
+``--multimapping-detection-method`` option)
-The simpliest method, "unique", groups reads with the exact same
-UMI. The network-based methods, "cluster", "adjacency" and
-"directional", build networks where nodes are UMIs and edges connect
-UMIs with an edit distance <= threshold (usually 1). The groups of
-reads are then defined from the network in a method-specific manner.
+2. The read with the highest mapping quality. Note that this is not
+the read sequencing quality and that if two reads have the same
+mapping quality then one will be picked at random regardless of the
+read quality.
- "unique"
- Reads group share the exact same UMI
+Otherwise a read is chosen at random.
- "percentile"
- Reads group share the exact same UMI. UMIs with counts < 1% of the
- median counts for UMIs at the same position are ignored.
-
- "cluster"
- Identify clusters of connected UMIs (based on hamming distance
- threshold). Each network is a read group
+Optional statistics output
+--------------------------
- "adjacency"
- Cluster UMIs as above. For each cluster, select the node(UMI)
- with the highest counts. Visit all nodes one edge away. If all
- nodes have been visted, stop. Otherise, repeat with remaining
- nodes until all nodes have been visted. Each step
- defines a read group.
+One can use the edit distance between UMIs at the same position as a
+quality control for the deduplication process by comparing with
+a null expectation of random sampling. For the random sampling, the
+observed frequency of UMIs is used to more reasonably model the null
+expectation.
- "directional" (default)
- Identify clusters of connected UMIs (based on hamming distance
- threshold) and umi A counts >= (2* umi B counts) - 1. Each
- network is a read group.
+Use the option ``Output UMI related statistics files?`` to generate stats outfiles:
-Options
--------
-
---extract-umi-method (choice)
- How are the UMIs encoded in the read?
-
- Options are:
-
- - "read_id" (default)
- UMIs contained at the end of the read separated as
- specified with --umi-separator option
-
- - "tag"
- UMIs contained in a tag, see --umi-tag option
-
---umi-separator (string)
- Separator between read id and UMI. See --extract-umi-method above
-
---umi-tag (string)
- Tag which contains UMI. See --extract-umi-method above
+edit_distance
+ Reports the (binned) average edit distance between the UMIs at each
+ position. Positions with a single UMI are reported separately. The
+ edit distances are reported pre- and post-deduplication alongside
+ the null expectation from random sampling of UMIs from the UMIs
+ observed across all positions. Note that separate null
+ distributions are reported since the null depends on the observed
+ frequency of each UMI which is different pre- and
+ post-deduplication. The post-deduplication values should be closer to
+ their respective null than the pre-deduplication vs null comparison.
---edit-distance-threshold (int)
- For the adjacency and cluster methods the threshold for the
- edit distance to connect two UMIs in the network can be
- increased. The default value of 1 works best unless the UMI is
- very long (>14bp)
-
---paired
- BAM is paired end - output both read pairs. This will also
- force the use of the template length to determine reads with
- the same mapping coordinates.
-
---spliced-is-unique
- Causes two reads that start in the same position on the same
- strand and having the same UMI to be considered unique if one is
- spliced and the other is not. (Uses the 'N' cigar operation to test
- for splicing)
+In addition, this option will trigger reporting of further summary
+statistics for the UMIs which may be informative for selecting the
+optimal deduplication method or debugging.
---soft-clip-threshold (int)
- Mappers that soft clip, will sometimes do so rather than mapping a
- spliced read if there is only a small overhang over the exon
- junction. By setting this option, you can treat reads with at least
- this many bases soft-clipped at the 3' end as spliced.
-
---multimapping-detection-method (string, choice)
- If the sam/bam contains tags to identify multimapping reads, you can
- specify for use when selecting the best read at a given loci.
- Supported tags are "NH", "X0" and "XT". If not specified, the read
- with the highest mapping quality will be selected
+Each unique UMI sequence may be observed [0-many] times at multiple
+positions in the BAM. The following files report the distribution for
+the frequencies of each UMI.
---read-length
- Use the read length as as a criteria when deduping, for e.g sRNA-Seq
-
---whole-contig
- Consider all alignments to a single contig together. This is useful if
- you have aligned to a transcriptome multi-fasta
-
---subset (float, [0-1])
- Only consider a fraction of the reads, chosen at random. This is useful
- for doing saturation analyses.
-
---chrom
- Only consider a single chromosome. This is useful for debugging purposes
+per_umi_per_position
+ The `_stats_per_umi_per_position.tsv` file simply tabulates the
+ counts for unique combinations of UMI and position. E.g. if prior to
+ deduplication, we have two positions in the BAM (POSa, POSb), at
+ POSa we have observed 2*UMIa, 1*UMIb and at POSb: 1*UMIc, 3*UMId,
+ then the stats file is populated thus:
---per-contig (string)
- Deduplicate per contig (field 3 in BAM; RNAME).
- All reads with the same contig will be
- considered to have the same alignment position. This is useful
- if your library prep generates PCR duplicates with non identical
- alignment positions such as CEL-Seq. In this case, you would
- align to a reference transcriptome with one transcript per gene
-
---per-gene (string)
- Deduplicate per gene. As above except with this option you can
- align to a reference transcriptome with more than one transcript
- per gene. You need to also provide --gene-transcript-map option.
- This will also add a metacontig ('MC') tag to the reads if used
- in conjunction with --output-bam
-
---gene-transcript-map (string)
- File mapping genes to transripts (tab separated), e.g:
-
- gene1 transcript1
- gene1 transcript2
- gene2 transcript3
+ ====== =============
+ counts instances_pre
+ ------ -------------
+ 1 2
+ 2 1
+ 3 1
+ ====== =============
+
+ If post deduplication, UMIb is grouped with UMIa such that POSa:
+ 3*UMIa, then the `instances_post` column is populated thus:
---gene-tag (string)
- Deduplicate per gene. As per --per-gene except here the gene
- information is encoded in the bam read tag specified so you do
- not need to supply --gene-transcript-map
-
---output-bam (string, filename)
- Output a tagged bam file to stdout or -S
-
--i, --in-sam/-o, --out-sam
- By default, inputs are assumed to be in BAM format and output are output
- in BAM format. Use these options to specify the use of SAM format for
- inputs or outputs.
+ ====== ============= ==============
+ counts instances_pre instances_post
+ ------ ------------- --------------
+ 1 2 1
+ 2 1 0
+ 3 1 2
+ ====== ============= ==============
--I (string, filename) input file name
- The input file must be sorted and indexed.
-
--S (string, filename) output file name
+per_umi_per
+ The `_stats_per_umi_per.tsv` table provides UMI-level summary
+ statistics. Keeping in mind that each unique UMI sequence can be
+ observed [0-many] times across multiple positions in the BAM,
--L (string, filename) log file name
+ :times_observed: How many positions the UMI was observed at
+ :total_counts: The total number of times the UMI was observed across all positions
+ :median_counts: The median for the distribution of how often the UMI was observed at each position (excluding zeros)
-Usage
------
- umi_tools dedup -I infile.bam -S grouped.bam --
-
- ]]>
+ Hence, whenever times_observed=1, total_counts==median_counts.]]>