annotate make_families.xml @ 5:4bc49a5769ee draft

Version 0.5: Split interleaved SSCS output file into two paired files.
author nick
date Thu, 01 Dec 2016 23:22:52 -0500
parents 7f513b9b1b1e
children 9a0bee12b583
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
1 <?xml version="1.0"?>
5
4bc49a5769ee Version 0.5: Split interleaved SSCS output file into two paired files.
nick
parents: 4
diff changeset
2 <tool id="make_families" name="Du Novo: Make families" version="0.5">
2
ba2a53b970ca planemo upload commit 670b3282d2c120882b956ad617e61369467fb0fe
nick
parents: 1
diff changeset
3 <description>of duplex sequencing reads</description>
0
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
4 <requirements>
5
4bc49a5769ee Version 0.5: Split interleaved SSCS output file into two paired files.
nick
parents: 4
diff changeset
5 <requirement type="package" version="0.5">duplex</requirement>
0
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
6 <requirement type="set_environment">DUPLEX_DIR</requirement>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
7 </requirements>
2
ba2a53b970ca planemo upload commit 670b3282d2c120882b956ad617e61369467fb0fe
nick
parents: 1
diff changeset
8 <!-- TODO: Add dependency on coreutils to get paste? -->
0
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
9 <command>paste $fastq1 $fastq2
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
10 | paste - - - -
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
11 | awk -f \$DUPLEX_DIR/make-barcodes.awk -v TAG_LEN=$taglen -v INVARIANT=$invariant
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
12 | sort
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
13 &gt; $output
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
14 </command>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
15 <inputs>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
16 <param name="fastq1" type="data" format="fastq" label="Sequencing reads, mate 1"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
17 <param name="fastq2" type="data" format="fastq" label="Sequencing reads, mate 2"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
18 <param name="taglen" type="integer" value="12" min="0" label="Tag length" help="length of each random barcode on the ends of the fragments"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
19 <param name="invariant" type="integer" value="5" min="0" label="Invariant sequence length" help="length of the sequence between the tag and actual sample sequence (the restriction site, normally)"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
20 </inputs>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
21 <outputs>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
22 <data name="output" format="tabular"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
23 </outputs>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
24 <tests>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
25 <test>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
26 <param name="fastq1" value="smoke_1.fq"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
27 <param name="fastq2" value="smoke_2.fq"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
28 <param name="taglen" value="5"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
29 <param name="invariant" value="1"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
30 <output name="output" file="smoke.families.tsv"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
31 </test>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
32 <test>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
33 <param name="fastq1" value="smoke_1.fq"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
34 <param name="fastq2" value="smoke_2.fq"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
35 <param name="taglen" value="5"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
36 <param name="invariant" value="0"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
37 <output name="output" file="smoke.families.i0.tsv"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
38 </test>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
39 </tests>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
40 <help>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
41
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
42 **What it does**
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
43
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
44 This tool processes raw duplex sequencing data: it removes the barcodes from the reads and uses them to group the reads into families that originate from the same fragment.
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
45
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
46 -----
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
47
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
48 **Output**
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
49
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
50 The output will be a tabular file where each line corresponds to a pair of input reads.
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
51
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
52 The columns are::
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
53
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
54 1: barcode (both tags joined and ordered)
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
55 2: tag order in barcode ("ab" or "ba")
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
56 3: read1 name
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
57 4: read1 sequence (minus the tag and invariant sequences)
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
58 5: read1 quality scores (minus the same tag and invariant)
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
59 6: read2 name
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
60 7: read2 sequence (minus the tag and invariant sequences)
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
61 8: read2 quality scores (minus the same tag and invariant)
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
62
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
63 -----
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
64
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
65 **Barcode creation**
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
66
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
67 For each pair, the tool will remove the tag at the beginning of each read and create a barcode by concatenating the two tags. The order in which the two tags are joined is determined by a string comparison, so that the same barcode is produced whichever read each tag came from. The original tag order will be noted in the second column.
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
68
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
69 Since pairs from opposite strands will have the same tags, but in the reverse order, this produces the same barcode for reads from the same fragment, regardless of strand. Then a simple sort will group all reads from the same fragment together, separated into strands by the different "order" values.
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
70
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
71 Examples::
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
72
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
73 +---------------+-----------------+
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
74 | input tags | output |
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
75 +-------+-------+-------+---------+
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
76 | read1 | read2 | order | barcode |
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
77 +-------+-------+-------+---------+
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
78 | ATG | CCT | ab | ATGCCT |
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
79 +-------+-------+-------+---------+
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
80 | CCT | ATG | ba | ATGCCT |
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
81 +-------+-------+-------+---------+
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
82
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
83 </help>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
84 </tool>