comparison galaxy_stubs/FingerprintSimilarityClustering.xml @ 2:605370bc1def draft default tip

Uploaded
author luis
date Tue, 12 Jul 2016 12:33:33 -0400
parents
children
comparison
equal deleted inserted replaced
1:31013b5cd066 2:605370bc1def
1 <?xml version='1.0' encoding='UTF-8'?>
2 <!--This is a configuration file for the integration of a tool into Galaxy (https://galaxyproject.org/). This file was automatically generated using CTD2Galaxy.-->
3 <!--Proposed Tool Section: [Chemoinformatics]-->
4 <tool id="FingerprintSimilarityClustering" name="FingerprintSimilarityClustering" version="1.1.0">
5 <description>fast clustering of compounds using 2D binary fingerprints</description>
6 <macros>
7 <token name="@EXECUTABLE@">FingerprintSimilarityClustering</token>
8 <import>macros.xml</import>
9 </macros>
10 <expand macro="stdio"/>
11 <expand macro="requirements"/>
12 <!-- Builds the command line. Guards on fp_col, id_col, tc and cc compare the rendered value so that 0 and 0.0 are still passed; text guards skip whitespace-only values; stdout is redirected into the param_stdout dataset declared in outputs. --><command>FingerprintSimilarityClustering
13 #if $param_t:
14 -t $param_t
15 #end if
16 #if $param_f:
17 -f $param_f
18 #end if
19 #if str($param_fp_col) != "":
20 -fp_col $param_fp_col
21 #end if
22 #if str($param_id_col) != "":
23 -id_col $param_id_col
24 #end if
25 #if str($param_fp_tag).strip():
26 -fp_tag "$param_fp_tag"
27 #end if
28 #if str($param_id_tag).strip():
29 -id_tag "$param_id_tag"
30 #end if
31 #if str($param_tc) != "":
32 -tc $param_tc
33 #end if
34 #if str($param_cc) != "":
35 -cc $param_cc
36 #end if
37 #if $param_l:
38 -l $param_l
39 #end if
40 #if str($param_nt).strip():
41 -nt "$param_nt"
42 #end if
43 #if $param_sdf_out:
44 -sdf_out $param_sdf_out
45 #end if
46 &gt; $param_stdout
47 </command>
48 <inputs><!-- Form parameters; each maps to the command-line flag named in its help text. param_f defaults to 1 so that the default lies inside its declared 1..2 range. -->
49 <param name="param_t" type="data" format="smi.gz,csv,sdf.gz,sdf,txt.gz,smi,txt,csv.gz" optional="False" label="Target library input file" help="(-t) "/>
50 <param name="param_f" type="integer" min="1" max="2" optional="False" value="1" label="Fingerprint format [1 = binary bitstring, 2 = comma separated feature list]" help="(-f) "/>
51 <param name="param_fp_col" type="integer" value="-1" label="Column number for comma separated smiles input which contains the fingerprint" help="(-fp_col) "/>
52 <param name="param_id_col" type="integer" value="-1" label="Column number for comma separated smiles input which contains the molecule identifier" help="(-id_col) "/>
53 <param name="param_fp_tag" type="text" size="30" value=" " label="Tag name for SDF input which contains the fingerprint" help="(-fp_tag) ">
54 <sanitizer>
55 <valid initial="string.printable">
56 <remove value="'"/>
57 <remove value="&quot;"/>
58 </valid>
59 </sanitizer>
60 </param>
61 <param name="param_id_tag" type="text" size="30" value=" " label="Tag name for SDF input which contains the molecule identifier" help="(-id_tag) ">
62 <sanitizer>
63 <valid initial="string.printable">
64 <remove value="'"/>
65 <remove value="&quot;"/>
66 </valid>
67 </sanitizer>
68 </param>
69 <param name="param_tc" type="float" value="0.7" label="Tanimoto cutoff [default: 0.7]" help="(-tc) "/>
70 <param name="param_cc" type="integer" value="1000" label="Clustering size cutoff [default: 1000]" help="(-cc) "/>
71 <param name="param_l" type="integer" value="0" label="Number of fingerprints to read" help="(-l) "/>
72 <param name="param_nt" type="text" size="30" value="1" label="Number of parallel threads to use" help="(-nt) To use all possible threads enter &lt;max&gt; [default: 1]">
73 <sanitizer>
74 <valid initial="string.printable">
75 <remove value="'"/>
76 <remove value="&quot;"/>
77 </valid>
78 </sanitizer>
79 </param>
80 <param name="param_sdf_out" type="integer" min="0" max="1" optional="True" value="0" label="If input file has SD format, this flag activates writing of clustering information as new tags in a copy of the input SD file" help="(-sdf_out) "/>
81 </inputs>
82 <expand macro="advanced_options"/>
83 <outputs><!-- Dataset labelled as the tool's stdout; "txt" is the Galaxy extension for plain text. -->
84 <data name="param_stdout" format="txt" label="Output from stdout"/>
85 </outputs>
86 <help>This tool performs a fast and deterministic semi-hierarchical clustering of input compounds encoded as 2D binary fingerprints.
87
88 The method is a multistep workflow which first reduces the number of input fingerprints by removing duplicates. This unique set is forwarded to connected
89 components decomposition by calculating all pairwise Tanimoto similarities and application of a similarity cutoff value. As a third step, all connected components
90 which exceed a predefined size are hierarchically clustered using the average linkage clustering criterion. The Kelley method is applied on every hierarchical clustering
91 to determine a level for cluster selection. Finally, the fingerprint duplicates are remapped onto the final clusters which contain their representatives.
92
93 For every final cluster a medoid is calculated. For a single cluster multiple medoids are possible because fingerprint duplicates of a medoid are also marked as medoids.
94
95 For every compound the output yields a cluster ID, a medoid tag where '1' indicates the cluster medoid(s) and the average similarity of the compound to all other
96 cluster members. If the output format is SD, these properties are added as new tags.
97
98 ======================================================================================================================================================
99
100 Examples:
101
102 $ FingerprintSimilarityClustering -t target.sdf -fp_tag FPRINT -f 1 -id_tag NAME
103 tries to read fingerprints as binary bitstrings (-f 1) from tag &lt;FPRINT&gt; and compound IDs from tag &lt;NAME&gt; of target.sdf input file.
104 The clustering workflow described is executed on the input molecules with default values.
105
106 $ FingerprintSimilarityClustering -t target.csv -fp_col 3 -f 2 -id_col 1
107 tries to read fingerprints as comma separated integer feature list (-f 2) from column 3 and IDs from column 1 out of a space separated CSV file.
108 The clustering workflow described is executed on the input molecules with default values.
109
110 $ FingerprintSimilarityClustering -t target.sdf -fp_tag FPRINT -f 1 -id_tag NAME -nt max
111 Same as first example but executed in parallel mode using as many threads as available.
112
113 $ FingerprintSimilarityClustering -t target.sdf -fp_tag FPRINT -f 1 -id_tag NAME -tc 0.5 -cc 50
114 Same as first example but using modified parameters for similarity network generation (-tc 0.5) and size of connected components to be clustered (-cc 50).
115
116 </help>
117 </tool>