comparison pca_pipeline_def.xml @ 0:64e75e21466e draft default tip

Uploaded
author pmac
date Wed, 01 Jun 2016 03:38:39 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:64e75e21466e
1 <tool id="pca_pipeline" name="PCA Pipeline" version="1.0.0">
2 <description>Iterative PCA pipeline</description>
3 <requirements>
4 <requirement type="package" version="2.8">Jinja2</requirement>
5 <!-- Disabled requirements, kept for reference. The help text states that the 'dbscan' and 'flashpcaR' R packages must be installed on the Galaxy instance itself. NOTE(review): confirm whether these should be re-enabled.
6 <requirement type="package" version="3.2.1">R</requirement>
7 <requirement type="package" version="1.2.5">flashpcaR</requirement>
8 -->
9 </requirements>
10 <!-- Runs iterative_pca.py with three positional arguments (input dataset, data type, iteration count) followed by options. Every substituted value is single-quoted so that a path or text value containing spaces or shell metacharacters reaches the script as exactly one argument. The generated config file ($cfile) is used only for variant data when no user config file is supplied. --> <command interpreter="python">
11 <![CDATA[
12 iterative_pca.py
13 '$datafile'
14 '$data_type'
15 '$iterations'
16 --sd_cutoff '$sd_cutoff'
17 --absolute_prefix '$output.files_path'
18 --html_file '$output'
19 #if $control_tag
20 --control_tag '$control_tag'
21 #end if
22 #if $cases_tag
23 --cases_tag '$cases_tag'
24 #end if
25 #if $data_type.value == "variant_data"
26 #if $user_configfile
27 --config_file '$user_configfile'
28 #else
29 --config_file '$cfile'
30 #end if
31 #end if
32 #if $clustering_flag.value == "yes"
33 --clustering_flag
34 --clustering_method '$clustering_method'
35 --cluster_trimming '$cluster_trimming'
36 #end if
37 #if $ethnicity_file
38 --ethnicity_file '$ethnicity_file'
39 #end if
40 #if $xsamples_file
41 --reject_samples '$xsamples_file'
42 #end if
43 #if $xsnps_file
44 --reject_snps '$xsnps_file'
45 #end if
46 --galaxy
47 ]]>
48 </command>
49 <!-- Template for the config file that is passed as the config_file option when the data type is variant_data and no user config file is given. It is built from the form fields, using the same four-section layout as the example in the help text. NOTE(review): the control section hard-codes '#Sample' instead of using $sample_id_column; confirm this is intended. NOTE(review): splitting patterns on a newline assumes the text area value still contains literal newlines when the template is rendered; verify. --> <configfiles>
50 <configfile name="cfile">#control
51 control_tag,#Sample,$control_tag
52 cases_tag,#Sample,$cases_tag
53 #column_names
54 genotype_column,$genotype_column
55 reference_column,$reference_column
56 alternate_column,$alternate_column
57 sample_id_column,$sample_id_column
58 chromosome_column,$chromosome_column
59 position_column,$position_column
60 variant_id_column,$variant_id_column
61 #numeric_filters
62 #for $i, $f in enumerate($numeric_filters)
63 $f.filter_name,$f.column_name,$f.operation,$f.cutoff
64 #end for
65 #string_filters
66 #for $i, $s in enumerate($string_filters)
67 $s.filter_name,$s.column_name,$s.exact_flag,$s.accept_flag
68 $(','.join($s.patterns.split('\n')))
69 #end for</configfile>
70 </configfiles>
71 <inputs> <!-- clustering_method and cluster_trimming are only passed to the script when clustering_flag is "yes" -->
72 <param name="datafile" type="data" label="Input datafile"/>
73 <param name="data_type" type="select" label="Type of input data file">
74 <option value="variant_data">Variant Data</option>
75 <option value="numeric_ped">Numeric PED File</option>
76 <option value="rdata">RData file</option>
77 </param>
78 <param name="iterations" type="integer" value="1" label="Number of iterations to complete"/>
79 <param name="clustering_flag" type="select" display="radio" label="Do clustering?">
80 <option value="yes">Yes</option>
81 <option value="no">No</option>
82 </param>
83 <param name="clustering_method" type="select" label="Clustering method (ignore if you selected 'No' for 'Do clustering?')">
84 <option value="dbscan">DBSCAN</option>
85 <option value="hclust">Hierarchical Clustering</option>
86 </param>
87 <param name="cluster_trimming" type="select" label="Algorithm used to identify and remove cluster outliers (ignore if you selected 'No' for 'Do clustering?')">
88 <option value="sd">Standard Deviations</option>
89 <option value="mcd">Mean Cluster Distance</option>
90 <option value="dbscan_outliers_only">DBSCAN outliers only (only valid if DBSCAN is selected as the 'Clustering method')</option>
91 </param>
92 <param name="sd_cutoff" type="float" value="2" label="Strictness of outlier trimming. Lower = more outliers cut at each stage, Higher = less outliers cut at each stage."/>
93 <!-- Control and cases tags: patterns found in sample ids; passed on the command line and written to the control section of the generated config file -->
94 <param name="control_tag" type="text" value="LP" label="Control Tag"/>
95 <param name="cases_tag" type="text" value="HAPS" label="Cases Tag"/>
96 <param name="user_configfile" type="data" format="txt" optional="true" label="Optional user provided config file.
97 NB:
98 - If this is set, ALL the fields below will be ignored.
99 - If no input is provided, and the input data is a text file containing variant data, ALL the fields below except the filters MUST be filled in"/>
100 <param name="ethnicity_file" type="data" format="txt" optional="true" label="Optional file containing data about ethnicity of samples"/>
101 <param name="xsamples_file" type="data" format="txt" optional="true" label="Optional file containing EXACT ids of samples to exclude"/>
102 <param name="xsnps_file" type="data" format="txt" optional="true" label="Optional file containing EXACT ids of SNPs to exclude"/>
103 <!-- Column headers of the variant data file; written to the column_names section of the generated config file -->
104 <param name="sample_id_column" type="text" value="#Sample" label="Sample ID Column"/>
105 <param name="variant_id_column" type="text" value="ID" label="Variant ID Column"/>
106 <param name="chromosome_column" type="text" value="CHROM" label="Chromosome Column"/>
107 <param name="position_column" type="text" value="POS" label="Position Column"/>
108 <param name="genotype_column" type="text" value="GT" label="Genotype Column"/>
109 <param name="reference_column" type="text" value="REF" label="Reference Allele Column"/>
110 <param name="alternate_column" type="text" value="ALT" label="Alternate Allele Column"/>
111 <!-- Numeric filters: each repeat becomes one line in the numeric_filters section of the generated config file -->
112 <repeat name="numeric_filters" title="Optional Numeric Filters">
113 <param name="filter_name" type="text" label="Filter Name"/>
114 <param name="column_name" type="text" label="Name of column to filter on"/>
115 <param name="operation" type="select" label="Accept if column value is:">
116 <option value="g">greater than</option>
117 <option value="l">less than</option>
118 <option value="e">equal to</option>
119 <option value="ge">greater than or equal to</option>
120 <option value="le">less than or equal to</option>
121 </param>
122 <param name="cutoff" type="float" value="0" label="Cutoff Value"/>
123 </repeat>
124 <!-- String filters: each repeat becomes two lines (settings, then the comma-joined patterns) in the string_filters section of the generated config file -->
125 <repeat name="string_filters" title="Optional String Filters">
126 <param name="filter_name" type="text" label="Filter Name"/>
127 <param name="column_name" type="text" label="Name of column to filter on"/>
128 <param name="exact_flag" type="select" label="Exact pattern matching?">
129 <option value="exact">Yes</option>
130 <option value="not_exact">No</option>
131 </param>
132 <param name="accept_flag" type="select" label="Action to perform after a successful match">
133 <option value="accept">Accept</option>
134 <option value="reject">Reject</option>
135 </param>
136 <param name="patterns" type="text" area="true" size="10x35" label="Patterns to match on" help="Enter a list of patterns here, separated by newlines"/>
137 </repeat>
138 </inputs>
139 <!-- Single HTML report. The command passes this dataset as the html_file option and its files_path as the absolute_prefix option. --> <outputs>
140 <data name="output" format="html">
141 <label>PCA summary: "${datafile.name}"</label>
142 </data>
143 </outputs>
144 <help><![CDATA[
145 .. class:: warningmark
146
147 **WARNING** This tool requires the 'dbscan' (https://cran.r-project.org/web/packages/dbscan/index.html) and 'flashpcaR' (https://github.com/gabraham/flashpca/releases) R packages to be installed on the Galaxy instance.
148
149 ======================================
150 Principal Component Analysis Pipeline
151 ======================================
152
153 Overview
154 --------
155 A tool which performs iterative principal component analysis.
156 The general idea is to separate patient samples based on their ethnicity, by performing PCA on the variant data of each sample.
157 After each analysis step, outliers are identified. The PCA is then repeated, with the outliers removed.
158 This process continues for a set number of iterations specified by the user. After the pipeline completes, the user can see a
159 detailed summary, as well as have access to the outliers identified at each iteration.
160
161 Primary Input
162 ---------------
163 As primary input the tool accepts a single file, which may be formatted in the following ways:
164
165 - **Variant data file:** This should be a tab-delimited text file, with each row containing data about a single variant site from a single person. If this option is selected, the column names which contain important information must also be specified, either via a configuration file (see below), or through the tool's form fields.
166 - **Numeric ped file:** See http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml for detailed information on PED format. This tool requires the affection status of each site to be specified numerically i.e.:
167
168 - 0 = homozygous reference
169 - 1 = heterozygous
170 - 2 = homozygous alternate
171
172 rather than consisting of pairs of genotypes for each site.
173 - **RData file:** File containing stored data from an R session. For this tool the input must meet certain requirements:
174
175 - The file can only contain a SINGLE R object, which must be a list.
176 - The list must contain a named 'bed' element.
177 - The 'bed' element must be an n x m matrix/data frame, where n = number of samples, m = number of unique snps found in all the samples.
178 - The A(i,j)th entry in the 'bed' matrix should indicate affectation status of the ith sample at the jth SNP site, according to the key for numeric ped files (as above).
179 - The row names of the 'bed' matrix must contain the ids of the samples.
180 - The column names of the 'bed' matrix must contain the ids of the SNPs.
181
182 If these very specific criteria are not met, the tool WILL fail.
183
184
185
186 Primary Output
187 ---------------
188
189 HTML file containing plots of the PCA for each iteration.
190 Possible plots, depending on user specified options:
191
192 - **Control vs Cases Plot:** If control and/or cases tags are provided, this plot will be output. ALL samples are plotted, with controls shown in blue, cases in red, unknown samples in black.
193 - **Cluster Plot:** Output if user opts to do clustering. Samples are plotted, with clusters colour-coded. Outliers as identified by DBSCAN are always red and use an open circle as the icon. Trimmed clusters use a cross for the icon, instead of a circle. Both the outliers (open circles) AND the rejected clusters (crosses) will be dropped in the next iteration.
194 - **Outliers Plot:** Output if user does NOT opt to do clustering. Samples which are considered outliers (as described above in 'Detecting outliers without clustering') are plotted as red open circles; all other samples are plotted as green full circles.
195 - **Standard Deviations Plot:** Samples are colour-coded by standard deviation. Samples which fall within 1 standard deviation of the median are red, <= 2 sds are green, <= 3 sds are blue, > 3 sds are purple.
196 - **Ethnicity Plot:** Each ethnicity uses a specific colour and symbol. Fairly self-explanatory. Plot is only output if an ethnicity data file is provided as input.
197
198 Beneath the plots there are also two expandable lists. Samples excluded shows which samples were not part of the PCA for this iteration. This is cumulative. Outliers shows the outliers detected in THIS iteration. Any available data from the ethnicity file (if provided) is also displayed for each excluded sample.
199
200 Options/Secondary Inputs
201 ------------------------
202 - **Type of input data file:** A variant data text file, a numeric ped file, or an RData file, as specified above
203 - **Number of iterations to complete:** A single iteration would involve performing PCA on the input data, then identifying and removing outliers. Two iterations would involve performing PCA again with the outliers identified from the first iteration excluded, three iterations would exclude the outliers from the first 2 stages, and so on and so forth.
204 - **Detecting outliers without clustering:** This is done by obtaining the standard deviations of the first two principal components. Any samples whose scores for either of these first two components fall more than 'n' number of standard deviations away from the component median are considered outliers.
205 - **Clustering:** The user may select from a range of algorithms which will try to identify clusters in the data, with each cluster hopefully corresponding to an ethnic group.
206 - **Clustering methods:**
207
208 - *DBSCAN (Density based spatial clustering of applications with noise):*
209
210 Forms clusters based on density of points, and does not require the number of clusters to be specified beforehand. Good for irregularly shaped, non-spherical clusters. Does NOT require all points to be part of clusters, and produces a set of 'outliers', i.e. points which do not belong to any clusters.
211
212 - *Hierarchical Clustering:*
213
214 Forms clusters based on distance between points. Tends to result in spherical clusters, but able to handle clusters of varying density. Forces every point to be assigned to a cluster. The number of clusters is determined separately, using the silhouette scores of all the points as a heuristic.
215
216 - **Cluster trimming methods:** All these methods first involve finding the centres of each cluster.
217
218 - *Standard Deviations:*
219
220 If the centroid of a cluster lies more than ‘n’ standard deviations (n is passed in as a parameter by the user) from the centroid of the entire dataset in either the x or y directions, the entire cluster is cut. If DBSCAN is selected, the outliers it identifies are also cut.
221
222 - *Mean Cluster Distance:*
223
224 Obtain the average distance between clusters, done by computing the distance between all pairs of clusters and taking the mean. For each cluster, we also compute an average “isolation” value, which is the mean of the distances between that particular cluster and all other clusters. If a cluster’s isolation value is larger than the average cluster distance (multiplied by the strictness weighting), then that cluster is considered an outlier and cut from the next iteration. If DBSCAN is selected, the outliers it identifies are also cut.
225
226 - *DBSCAN outliers only:*
227
228 Only cut the points identified by the DBSCAN algorithm as not belonging to any cluster. No entire clusters are cut. Obviously this method is only applicable if DBSCAN is selected as the clustering method. THE TOOL WILL NOT RUN IF YOU SELECT THIS OPTION TOGETHER WITH 'Hierarchical Clustering' AS THE CLUSTERING METHOD.
229
230 - **Strictness:** A multiplier used to determine how 'strict' the outlier cutting methods are. For example, if strictness = 1, and we are not doing clustering, all points which lie more than 1 sd from the median are cut. If strictness = 2, all points which lie more than 2 sd from the median are cut, etc.
231
232 - **Control Tag:** A pattern present in the ids of all the control samples, e.g. "LP"
233
234 - **Cases Tag:** A pattern present in the ids of all the cases samples, e.g. "HAPS"
235
236 - **Configuration file:** A configuration file to accompany an input variant text file. The config file has a rather specific format, an example is given below::
237
238 #control
239 control_tag,#Sample,HAPS
240 cases_tag,#Sample,LP
241 #column_names
242 genotype_column,GT
243 reference_column,REF
244 alternate_column,ALT
245 sample_id_column,#Sample
246 chromosome_column,CHROM
247 position_column,POS
248 variant_id_column,ID
249 #numeric_filters
250 strand_bias_filter,Fraction_with_strand_bias,<,0.03
251 position_bias_filter,Fraction_with_positional_bias,<,0.03
252 count_filter,Num_samples_variant,>,1
253 pass_filter,Fraction_samples_passed_filter,>,0.9
254 #string_filters
255 variant_type_filter,Variant_Type,exact,accept
256 SNV
257 genotype_filter,GT,exact,accept
258 '0/1,'1/1
259
260 File consists of up to four sections, the starts of which are marked by lines beginning with an octothorpe.
261
262 - *'#control' section:* Indicates substrings found in ids of controls and cases
263 - *'#column_names' section:* This is the only required section. First column indicates what column name (in the variant text file) the second column specifies. The same keys i.e. left most column values, as shown in the example must be used, e.g. sample_id_column, the RHS column names must match the names in the variant data file. If using a generated config file, only modify the RHS column, and DO NOT REMOVE ANY rows from this section.
264 - *'#numeric_filters' section:* Each filter takes up a single line, and is separated into 4 sections by commas.
265
266 - Column 1: Name of the filter, which is arbitrary
267 - Column 2: The name of the column in the variant file to filter on. If this column is not found, a warning is displayed
268 - Column 3: The criteria of the filter which must be passed in order for us to accept a particular row. E.g. less than, greater than
269 - Column 4: The cutoff value to be compared against.
270
271 - *'#string_filters' section:* Each filter takes up two lines.
272
273 - Line 1, Column 1: Arbitrary filter name
274 - Line 1, Column 2: Column name to filter on
275 - Line 1, Column 3: Do the patterns have to be exact matches, or just substrings? E.g. if pattern = "HAPS" and string being compared = "HAPS-909090", if exact was true this would not be a successful match, whereas if not_exact was true it would be a match.
276 - Line 1, Column 4: What to do with the row if a successful match is detected, e.g. accept or reject
277 - Line 2: A comma separated list of patterns to match on
278
279
280 - **Ethnicity file:** An ethnicity file containing ethnicity data, and possibly other data, on the samples. Note this data is not used to sort the input and has no effect on the PCA itself. It is used only to label the results of the output.
281
282 Requirements:
283
284 - tab delimited
285 - Must have at least two columns
286 - First column has sample ID's
287 - Second column has ethnicities
288 - First row must be a header
289
290 First few lines of a correctly formatted ethnicity file given below::
291
292 IID population Halo1.or.2. BloodAge SalivaAge COB ethnicity
293 LP-10000001 AUSTRALIAN Halo2 - LP-BC 67 NA Australia australian
294 LP-10000003 AUSTRALIAN Halo1 45 NA Australia australian southern_european
295 LP-10000005 AUSTRALIAN Halo1 73 NA Australia australian southern_european
296 LP-10000008 EUROPE Halo1 54 NA South Eastern Europe south_east_european
297 LP-10000009 OTHER Halo1 65 NA Southern & East Africa jewish
298
299 - **Exclude samples file:** A text file containing exact ids of samples to exclude from the PCA.
300
301 Requirements:
302
303 - single column
304 - sample ids seperated by newlines
305 - one sample id per line
306
307 Example::
308
309 HAPS-90573
310 HAPS-90578R
311 HAPS-110542
312 HAPS-110605
313 HAPS-110620
314 HAPS-110638
315 HAPS-110649
316 HAPS-110668
317 HAPS-110799
318 HAPS-110813
319 HAPS-110959
320 HAPS-111186
321 HAPS-111298
322 HAPS-111404
323 HAPS-111493
324 HAPS-111512
325 HAPS-111538
326
327 - **Exclude SNPS file:** A text file containing exact ids of SNPs to exclude from the PCA.
328
329 Requirements:
330
331 - single column
332 - snp ids separated by newlines
333 - one snp id per line
334
335 Example::
336
337 rs72896283
338 rs7534447
339 rs4662775
340 rs10932813
341 rs10932816
342 rs12330369
343 rs1802904
344 rs10902762
345 rs9996817
346 rs6446393
347 rs871133
348 rs4301095
349 rs941849
350 rs6917467
351 rs75834296
352 rs142922667
353
354 - **Required Column Headers:** If a variant text file is the primary input, the following information MUST be provided, either through the config file, or by filling out the corresponding fields in the tool submission form.
355
356 - Sample IDs: Name of the column containing the sample ids
357 - Chromosome: Name of the column indicating what chromosome the SNP is found on
358 - Position: Name of the column indicating at which position on the chromosome the SNP is found
359 - Genotype: The genotype of the sample for this site
360 - Reference: The 'normal'/'common' genotype for this site
361 - Alternate: The alternate genotype for this site
362 - Variant IDs: Name of the column indicating the ID of the SNP
363
364 - **Numeric Filters:** See Configuration file section
365 - **String Filters:** See Configuration file section
366
367 Other Output
368 -------------
369
370 - Tool will output a root folder containing the HTML file and all the plots, placed in directories separated by iteration.
371 - If the input data was a variant file, the output folder will also contain a numeric ped file, generated before the first iteration, as well as a config file. The config file is either the exact one passed in by the user, or one automatically generated from the form input, which can be used for future PCA runs.
372
373 ]]>
374
375
376 </help>
377 </tool>