annotate make_families.xml @ 0:d2e46adc199e draft

planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
author nick
date Mon, 23 Nov 2015 22:06:21 -0500
parents
children b63d6673f883
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
1 <?xml version="1.0"?>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
2 <tool id="make_families" name="Make families" version="0.1">
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
3 <description>from duplex sequencing data</description>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
4 <requirements>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
5 <requirement type="package" version="0.1">duplex</requirement>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
6 <requirement type="set_environment">DUPLEX_DIR</requirement>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
7 </requirements>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
8 <command>paste $fastq1 $fastq2
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
9 | paste - - - -
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
10 | awk -f \$DUPLEX_DIR/make-barcodes.awk -v TAG_LEN=$taglen -v INVARIANT=$invariant
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
11 | sort
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
12 &gt; $output
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
13 </command>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
14 <inputs>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
15 <param name="fastq1" type="data" format="fastq" label="Sequencing reads, mate 1"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
16 <param name="fastq2" type="data" format="fastq" label="Sequencing reads, mate 2"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
17 <param name="taglen" type="integer" value="12" min="0" label="Tag length" help="length of each random barcode on the ends of the fragments"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
18 <param name="invariant" type="integer" value="5" min="0" label="Invariant sequence length" help="length of the sequence between the tag and actual sample sequence (the restriction site, normally)"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
19 </inputs>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
20 <outputs>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
21 <data name="output" format="tabular"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
22 </outputs>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
23 <tests>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
24 <test>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
25 <param name="fastq1" value="smoke_1.fq"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
26 <param name="fastq2" value="smoke_2.fq"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
27 <param name="taglen" value="5"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
28 <param name="invariant" value="1"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
29 <output name="output" file="smoke.families.tsv"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
30 </test>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
31 <test>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
32 <param name="fastq1" value="smoke_1.fq"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
33 <param name="fastq2" value="smoke_2.fq"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
34 <param name="taglen" value="5"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
35 <param name="invariant" value="0"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
36 <output name="output" file="smoke.families.i0.tsv"/>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
37 </test>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
38 </tests>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
39 <help>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
40
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
41 **What it does**
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
42
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
43 This tool processes raw duplex sequencing data: it removes the barcodes from the reads and uses them to group the reads into families that originate from the same fragment.
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
44
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
45 -----
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
46
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
47 **Output**
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
48
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
49 The output will be a tabular file where each line corresponds to a pair of input reads.
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
50
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
51 The columns are::
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
52
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
53 1: barcode (both tags joined and ordered)
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
54 2: tag order in barcode ("ab" or "ba")
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
55 3: read1 name
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
56 4: read1 sequence (minus the tag and invariant sequences)
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
57 5: read1 quality scores (minus the same tag and invariant)
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
58 6: read2 name
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
59 7: read2 sequence (minus the tag and invariant sequences)
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
60 8: read2 quality scores (minus the same tag and invariant)
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
61
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
62 -----
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
63
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
64 **Barcode creation**
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
65
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
66 For each pair, the tool removes the tag at the beginning of each read and creates a barcode by concatenating the two tags. The order in which the tags are joined is determined by a string comparison, so the same barcode is produced whichever read each tag came from. The original tag order is recorded in the second column ("ab" or "ba").
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
67
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
68 Since pairs from opposite strands will have the same tags, but in the reverse order, this produces the same barcode for reads from the same fragment, regardless of strand. A simple sort will then group all reads from the same fragment together, subdivided by strand according to their different "order" values.
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
69
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
70 Examples::
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
71
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
72 +---------------+-----------------+
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
73 | input tags | output |
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
74 +-------+-------+-------+---------+
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
75 | read1 | read2 | order | barcode |
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
76 +-------+-------+-------+---------+
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
77 | ATG | CCT | ab | ATGCCT |
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
78 +-------+-------+-------+---------+
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
79 | CCT | ATG | ba | ATGCCT |
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
80 +-------+-------+-------+---------+
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
81
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
82 </help>
d2e46adc199e planemo upload commit 35b743e6492923c0e2b1e5e434eaf4e56d268108
nick
parents:
diff changeset
83 </tool>