comparison csem/csem.xml @ 0:a8f2c2a5f11b

Uploaded
author dongjun
date Mon, 12 Sep 2011 09:54:50 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a8f2c2a5f11b
1 <tool id="csem" name="CSEM: Multi-read Allocation for ChIP-seq" version="1.0.0">
2
3 <description></description>
4
5 <parallelism method="basic"></parallelism>
6
7 <requirements> <!-- External dependencies declared by this tool: the csem binary and the bowtie package. -->
8 <requirement type="binary">csem</requirement>
9 <requirement type="package">bowtie</requirement>
10 </requirements>
11
12 <command interpreter="perl">
13 csem_wrapper.pl
14 ## Input file name (dataset chosen in the InputParams conditional)
15 $InputParams.Input
16 ## Input file format (fasta or fastq, as selected in InfileFormat)
17 $InputParams.InfileFormat
18 ## Output file name
19 $out_csem
20 ## Output file format (bed, gff or table)
21 $OutfileFormat
22 ## Reference genome index for Bowtie. NOTE(review): as written, the path field and the selected value are emitted back to back with no separator; confirm this is what csem_wrapper.pl expects.
23 $index.fields.path$index
24 ## Generate pseudo-tags? (N or Y)
25 $pseudoTag
26 ## Bowtie settings (Max num of mismatches, Max num of aligned positions); the preSet branch passes 2 and 99
27 #if $bowtieParams.bSettingsType == "preSet"
28 2
29 99
30 #else
31 $bowtieParams.Mismatch
32 $bowtieParams.SuppressAlign
33 #end if
34 ## CSEM settings (window size, number of iterations); the preSet branch passes 101 and 200
35 #if $csemParams.cSettingsType == "preSet"
36 101
37 200
38 #else
39 $csemParams.windowSize
40 $csemParams.nIteration
41 #end if
42 ## Number of cores to use (hard-coded to 8 here)
43 8
44 </command>
45
46 <inputs> <!-- User-facing parameters; their values are substituted into the command template. -->
47 <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed - contact Galaxy team.">
48 <options from_data_table="bowtie_indexes"> <!-- Choices are read from the bowtie_indexes data table. -->
49 <filter type="sort_by" column="2" />
50 <validator type="no_options" message="No indexes are available" />
51 </options>
52 </param>
53 <conditional name="InputParams"> <!-- InfileFormat decides which dataset formats the Input param accepts. -->
54 <param name="InfileFormat" type="select" label="Select file format to process" help="Bowtie accepts FASTA or FASTQ file formats.">
55 <option value="fasta">FASTA</option>
56 <option value="fastq">FASTQ</option>
57 </param>
58 <when value="fasta">
59 <param name="Input" type="data" format="fasta" label="FASTA file"/>
60 </when>
61 <when value="fastq">
62 <param name="Input" type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" label="FASTQ file"/>
63 </when>
64 </conditional> <!-- InputParams -->
65 <param name="OutfileFormat" type="select" label="Select file format to export" help="Multi-read allocator can export results into BED or GFF file formats, or as a table.">
66 <option value="bed">BED</option>
67 <option value="gff">GFF</option>
68 <option value="table">table</option>
69 </param>
70 <param name="pseudoTag" type="select" label="Generate pseudo-tags?" help="See section 'Pseudo-tags' in the help below for more details.">
71 <option value="N">NO</option>
72 <option value="Y">YES</option>
73 </param>
74 <conditional name="bowtieParams"> <!-- Bowtie options: fixed presets or user-supplied values. -->
75 <param name="bSettingsType" type="select" label="Bowtie settings to use" help="For most mapping applications, use the 'Commonly used' settings. If you want full control, use 'Full parameter list'.">
76 <option value="preSet">Commonly used</option>
77 <option value="full">Full parameter list</option>
78 </param>
79 <when value="preSet" /> <!-- No extra params; the command template supplies 2 and 99. -->
80 <when value="full">
81 <param name="Mismatch" type="integer" value="2" label="Maximum number of mismatches permitted (-v)" help="May be 0, 1, 2, or 3." />
82 <param name="SuppressAlign" type="integer" value="99" label="Suppress all alignments for a read if more than n reportable alignments exist (-m)" help="99 is appropriate for most cases. Use -1 for no limit." />
83 </when> <!-- full -->
84 </conditional> <!-- bowtieParams -->
85 <conditional name="csemParams"> <!-- CSEM options: fixed presets or user-supplied values. -->
86 <param name="cSettingsType" type="select" label="CSEM settings to use" help="For most multi-read allocation applications, use the 'Commonly used' settings. If you want full control, use 'Full parameter list'.">
87 <option value="preSet">Commonly used</option>
88 <option value="full">Full parameter list</option>
89 </param>
90 <when value="preSet" /> <!-- No extra params; the command template supplies 101 and 200. -->
91 <when value="full">
92 <param name="windowSize" type="integer" value="101" label="Window size for the multi-read allocator" help="Set window size to some odd number close to the half of average fragment length." />
93 <param name="nIteration" type="integer" value="200" label="Number of iterations for the multi-read allocator" help="200 is appropriate for most cases." />
94 </when> <!-- full -->
95 </conditional> <!-- csemParams -->
96 </inputs>
97
98 <outputs>
99 <data format="tabular" name="out_csem"> <!-- Declared as tabular; switched to bed or gff below when OutfileFormat has that value. -->
100 <change_format>
101 <when input="OutfileFormat" value="bed" format="bed" />
102 <when input="OutfileFormat" value="gff" format="gff" />
103 </change_format>
104 </data>
105 </outputs>
106
107 <tests>
108 <test> <!-- FASTA input, BED output, no pseudo-tags, preset Bowtie and CSEM settings. -->
109 <param name="index" value="eschColi_K12" />
110 <param name="InfileFormat" value="fasta" />
111 <param name="Input" ftype="fasta" value="csem_test1_in.fa" />
112 <param name="OutfileFormat" value="bed" />
113 <param name="pseudoTag" value="N" />
114 <param name="bSettingsType" value="preSet" />
115 <param name="cSettingsType" value="preSet" />
116 <output name="out_csem" ftype="bed" file="csem_test1_out_original_sorted.bed" sort="True" />
117 </test>
118 <test> <!-- FASTQ input, BED output, pseudo-tags enabled, preset Bowtie and CSEM settings. -->
119 <param name="index" value="eschColi_K12" />
120 <param name="InfileFormat" value="fastq" />
121 <param name="Input" ftype="fastq" value="csem_test1_in.fq" />
122 <param name="OutfileFormat" value="bed" />
123 <param name="pseudoTag" value="Y" />
124 <param name="bSettingsType" value="preSet" />
125 <param name="cSettingsType" value="preSet" />
126 <output name="out_csem" ftype="bed" file="csem_test1_out_pseudo_sorted.bed" sort="True" />
127 </test>
128 </tests>
129
130 <help>
131
132 **What it does**
133
134 CSEM (ChIP-Seq multi-read allocation using the E-M algorithm) is a multi-read allocation algorithm. *Multi-reads* are the reads that map to multiple locations on the reference genome. Most common analyses of ChIP-seq data rely on using only reads that map uniquely to the relevant reference genome (*uni-reads*). This can lead to the omission of up to 30% of alignable reads. Chung et al. (2011) illustrated that incorporation of multi-reads significantly increases sequencing depth, leads to detection of novel peaks that are not otherwise identifiable with uni-reads, and improves detection of peaks in low-mappability regions. The computational and experimental results established that multi-reads can be of critical importance for studying DNA-protein interactions in highly repetitive regions of genomes with ChIP-seq experiments. Output from CSEM can be used with other peak callers such as MOSAiCS and MACS to identify peaks that are in both high- and low-mappability regions of genomes.
135
136 Please cite: Chung D, Kuan PF, Li B, SanalKumar R, Liang K, Bresnick E, Dewey C, and Keles S (2011),
137 "Discovering transcription factor binding sites in highly repetitive regions of genomes
138 with multi-read analysis of ChIP-Seq data," PLoS Computational Biology, 7(7): e1002111.
139
140 ------
141
142 **Input formats**
143
144 CSEM accepts short reads aligned using bowtie as input. Bowtie accepts single-end reads, in FASTA or FASTQ format, as input. Quality scores of reads are ignored.
145
146 ------
147
148 **Pseudo-tags**
149
150 For each read in the alignment file, CSEM estimates the fraction of the read allocated to each of its alignments. This fraction reflects the degree of confidence in each particular alignment. Currently, only the peak caller MOSAiCS can accept fractions of reads as input. However, you can incorporate multi-reads into ChIP-seq analysis with your favorite peak caller by utilizing this pseudo-tag functionality. Pseudo-tags are generated by assigning each multi-read to the location it maps to with the largest weight and filtering out multi-reads with weights less than 0.5. Although summarizing CSEM output as pseudo-tags decreases the number of utilized multi-reads, it still leads to a significant increase in the sequencing depth compared to using uni-reads alone and facilitates identification of peaks in repetitive regions.
151
152 ------
153
154 **Outputs**
155
156 Currently, results from CSEM can be exported into BED or GFF file formats, or as a table. Each line of the output file specifies a single alignment. The lines of the output file are ordered such that all of the unique read alignments appear first. If pseudo-tags are generated, *FRAC* equals 1 for all reads if the output is a table, and *score* is set to 1000 for all reads in the BED and GFF formats.
157
158 If the output is a table, it has the following columns::
159
160 Column Description
161 -------- --------------------------------------------------------
162 1 RID ID of a read
163 2 CID Chromosome of the alignment
164 3 DIR Strand of the alignment (+ or -)
165 4 POS Left-most position of the aligned read (the first base in a chromosome is numbered 1)
166 5 FRAC Fraction of the read allocated to the alignment (which is 1 for uni-reads)
167
168 If the output is in BED format, it has the following columns::
169
170 Column Description
171 ------------ --------------------------------------------------------
172 1 chrom Chromosome of the alignment
173 2 chromStart Start position of the aligned read (the first base in a chromosome is numbered 0)
174 3 chromEnd End position of the aligned read (the first base in a chromosome is numbered 0)
175 4 name ID of a read
176 5 score 1000 * fraction of the read allocated to the alignment (which is 1000 for uni-reads)
177 6 strand Strand of the alignment (+ or -)
178
179 If the output is in GFF format, it has the following columns::
180
181 Column Description
182 --------- --------------------------------------------------------
183 1 seqname Chromosome of the alignment
184 2 source Always "CSEM"
185 3 feature ID of a read
186 4 start Start position of the aligned read (the first base in a chromosome is numbered 1)
187 5 end End position of the aligned read (the first base in a chromosome is numbered 1)
188 6 score 1000 * fraction of the read allocated to the alignment (which is 1000 for uni-reads)
189 7 strand Strand of the alignment (+ or -)
190 8 frame Always "."
191 9 group Always "."
192
193
194 </help>
195 </tool>