Mercurial > repos > luis > ball
comparison galaxy_stubs/FingerprintSimilarityClustering.xml @ 2:605370bc1def draft default tip
Uploaded
author | luis |
---|---|
date | Tue, 12 Jul 2016 12:33:33 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1:31013b5cd066 | 2:605370bc1def |
---|---|
1 <?xml version='1.0' encoding='UTF-8'?> | |
2 <!--This is a configuration file for the integration of a tool into Galaxy (https://galaxyproject.org/). This file was automatically generated using CTD2Galaxy.--> | |
3 <!--Proposed Tool Section: [Chemoinformatics]--> | |
4 <tool id="FingerprintSimilarityClustering" name="FingerprintSimilarityClustering" version="1.1.0"> | |
5 <description>fast clustering of compounds using 2D binary fingerprints</description> | |
6 <macros> | |
7 <token name="@EXECUTABLE@">FingerprintSimilarityClustering</token> | |
8 <import>macros.xml</import> | |
9 </macros> | |
10 <expand macro="stdio"/> | |
11 <expand macro="requirements"/> | |
12 <command>FingerprintSimilarityClustering | |
13 | |
14 #if $param_t: | |
15 -t $param_t | |
16 #end if | |
17 #if $param_f: | |
18 -f $param_f | |
19 #end if | |
20 #if $param_fp_col: | |
21 -fp_col $param_fp_col | |
22 #end if | |
23 #if $param_id_col: | |
24 -id_col $param_id_col | |
25 #end if | |
26 #if $param_fp_tag: | |
27 -fp_tag "$param_fp_tag" | |
28 #end if | |
29 #if $param_id_tag: | |
30 -id_tag "$param_id_tag" | |
31 #end if | |
32 #if $param_tc: | |
33 -tc $param_tc | |
34 #end if | |
35 #if $param_cc: | |
36 -cc $param_cc | |
37 #end if | |
38 #if $param_l: | |
39 -l $param_l | |
40 #end if | |
41 #if $param_nt: | |
42 -nt "$param_nt" | |
43 #end if | |
44 #if $param_sdf_out: | |
45 -sdf_out $param_sdf_out | |
46 #end if | |
47 </command> | |
48 <inputs> | |
49 <param name="param_t" type="data" format="smi.gz,csv,sdf.gz,sdf,txt.gz,smi,txt,csv.gz" optional="False" value="&lt;class 'CTDopts.CTDopts._Null'&gt;" label="Target library input file" help="(-t) "/> | |
50 <param name="param_f" type="integer" min="1" max="2" optional="False" value="0" label="Fingerprint format [1 = binary bitstring, 2 = comma separated feature list]" help="(-f) "/> | |
51 <param name="param_fp_col" type="integer" value="-1" label="Column number for comma separated smiles input which contains the fingerprint" help="(-fp_col) "/> | |
52 <param name="param_id_col" type="integer" value="-1" label="Column number for comma separated smiles input which contains the molecule identifier" help="(-id_col) "/> | |
53 <param name="param_fp_tag" type="text" size="30" value=" " label="Tag name for SDF input which contains the fingerprint" help="(-fp_tag) "> | |
54 <sanitizer> | |
55 <valid initial="string.printable"> | |
56 <remove value="'"/> | |
57 <remove value="&quot;"/> | |
58 </valid> | |
59 </sanitizer> | |
60 </param> | |
61 <param name="param_id_tag" type="text" size="30" value=" " label="Tag name for SDF input which contains the molecule identifier" help="(-id_tag) "> | |
62 <sanitizer> | |
63 <valid initial="string.printable"> | |
64 <remove value="'"/> | |
65 <remove value="&quot;"/> | |
66 </valid> | |
67 </sanitizer> | |
68 </param> | |
69 <param name="param_tc" type="float" value="0.7" label="Tanimoto cutoff [default: 0.7]" help="(-tc) "/> | |
70 <param name="param_cc" type="integer" value="1000" label="Clustering size cutoff [default: 1000]" help="(-cc) "/> | |
71 <param name="param_l" type="integer" value="0" label="Number of fingerprints to read" help="(-l) "/> | |
72 <param name="param_nt" type="text" size="30" value="1" label="Number of parallel threads to use" help="(-nt) To use all possible threads enter &lt;max&gt; [default: 1]"> | |
73 <sanitizer> | |
74 <valid initial="string.printable"> | |
75 <remove value="'"/> | |
76 <remove value="&quot;"/> | |
77 </valid> | |
78 </sanitizer> | |
79 </param> | |
80 <param name="param_sdf_out" type="integer" min="0" max="1" optional="True" value="0" label="If input file has SD format, this flag activates writing of clustering information as new tags in a copy of the input SD file" help="(-sdf_out) "/> | |
81 </inputs> | |
82 <expand macro="advanced_options"/> | |
83 <outputs> | |
84 <data name="param_stdout" format="text" label="Output from stdout"/> | |
85 </outputs> | |
86 <help>This tool performs a fast and deterministic semi-hierarchical clustering of input compounds encoded as 2D binary fingerprints. | |
87 | |
88 The method is a multistep workflow which first reduces the number of input fingerprints by removing duplicates. This unique set is forwarded to connected | |
89 components decomposition by calculating all pairwise Tanimoto similarities and application of a similarity cutoff value. As a third step, all connected components | |
90 which exceed a predefined size are hierarchically clustered using the average linkage clustering criterion. The Kelley method is applied on every hierarchical clustering | |
91 to determine a level for cluster selection. Finally, the fingerprint duplicates are remapped onto the final clusters which contain their representatives. | |
92 | |
93 For every final cluster a medoid is calculated. For a single cluster multiple medoids are possible because fingerprint duplicates of a medoid are also marked as medoid. | |
94 | |
95 For every compound the output yields a cluster ID, a medoid tag where '1' indicates the cluster medoid(s) and the average similarity of the compound to all other | |
96 cluster members. If the output format is SD, these properties are added as new tags. | |
97 | |
98 ====================================================================================================================================================== | |
99 | |
100 Examples: | |
101 | |
102 $ FingerprintSimilarityClustering -t target.sdf -fp_tag FPRINT -f 1 -id_tag NAME | |
103 tries to read fingerprints as binary bitstrings (-f 1) from tag &lt;FPRINT&gt; and compound IDs from tag &lt;NAME&gt; of target.sdf input file. | |
104 The clustering workflow described is executed on the input molecules with default values. | |
105 | |
106 $ FingerprintSimilarityClustering -t target.csv -fp_col 3 -f 2 -id_col 1 | |
107 tries to read fingerprints as comma separated integer feature list (-f 2) from column 3 and IDs from column 1 out of a space separated CSV file. | |
108 The clustering workflow described is executed on the input molecules with default values. | |
109 | |
110 $ FingerprintSimilarityClustering -t target.sdf -fp_tag FPRINT -f 1 -id_tag NAME -nt max | |
111 Same as first example but executed in parallel mode using as many threads as available. | |
112 | |
113 $ FingerprintSimilarityClustering -t target.sdf -fp_tag FPRINT -f 1 -id_tag NAME -tc 0.5 -cc 50 | |
114 Same as first example but using modified parameters for similarity network generation (-tc 0.5) and size of connected components to be clustered (-cc 50). | |
115 | |
116 </help> | |
117 </tool> |