changeset 33:abf0bfe01c78 draft

Uploaded
author bimib
date Wed, 16 Oct 2019 16:25:56 -0400
parents b795e3e163e0
children 1a97d1537623
files Marea/marea.xml Marea/marea_cluster.py Marea/marea_cluster.xml
diffstat 3 files changed, 61 insertions(+), 44 deletions(-) [+]
line wrap: on
line diff
--- a/Marea/marea.xml	Wed Oct 16 07:12:37 2019 -0400
+++ b/Marea/marea.xml	Wed Oct 16 16:25:56 2019 -0400
@@ -22,11 +22,11 @@
                 --custom_map $cond_rule.cond_map.Custom_map
             #end if
         #end if
-
+	
       	--tool_dir $__tool_directory__
       	--option $cond.type_selector
-        --out_log $log
-
+        --out_log $log		
+	
         #if $cond.type_selector == 'datasets':
             --input_datas
             #for $data in $cond.input_Datasets:
@@ -43,7 +43,7 @@
 	    	--generate_svg ${cond.advanced.generateSvg}
 	    	--generate_pdf ${cond.advanced.generatePdf}
 	    --generate_ras ${cond.advanced.generateRas}
-	#else
+	#else 
 	    --none true
 	    --pValue 0.05
 	    --fChange 1.5
@@ -61,7 +61,7 @@
 	    --generate_svg ${cond.advanced.generateSvg}
 	    --generate_pdf ${cond.advanced.generatePdf}
 	    --generate_ras ${cond.advanced.generateRas}
-	#else
+	#else 
 	    --none true
 	    --pValue 0.05
 	    --fChange 1.5
@@ -73,7 +73,7 @@
         #if $cond.type_selector == 'datasets_rasonly':
             --input_datas ${input_Datasets}
             --single_ras_file $ras_single
-            --none ${None}
+            --none ${cond.None}
         #end if
         ]]>
     </command>
@@ -108,56 +108,56 @@
             </param>
             <when value="datasets">
                 <repeat name="input_Datasets" title="RNAseq" min="2">
-                    <param name="input" argument="--input_datas" type="data" format="tabular, csv, tsv" label="add dataset" />
+                    <param name="input" argument="--input_datas" type="data" format="tabular, csv, tsv" label="add dataset" />	
                     <param name="input_name" argument="--names" type="text" label="Dataset's name:" value="Dataset" help="Default: Dataset" />
                 </repeat>
                 <conditional name="advanced">
-					<param name="choice" type="boolean" checked="false" label="Use advanced options?" help="Use this options to choose custom rules for evaluation: pValue, Fold-Change threshold, how to solve (A and NaN) and specify output maps.">
+					<param name="choice" type="boolean" checked="false" label="Use advanced options?" help="Use these options to choose custom parameters for evaluation: pValue, Fold-Change threshold, how to solve (A and NaN) and specify output maps.">
 		    			<option value="true" selected="true">No</option>
 		    			<option value="false">Yes</option>
 					</param>
 					<when value="false">
 					</when>
 					<when value="true">
-		    			<param name="None" argument="--none" type="boolean" truevalue="true" falsevalue="false" checked="true" label="(A and NaN) solved as (A)?" />
+		    			<param name="None" argument="--none" type="boolean" truevalue="true" falsevalue="false" checked="true" label="(A and NaN) solved as (A)?" /> 
 		    			<param name="pValue" argument="--pValue" type="float" size="20" value="0.01" max="1" min="0" label="P-value threshold:" help="min value 0" />
 		    			<param name="fChange" argument="--fChange" type="float" size="20" value="1.2" min="1" label="Fold-Change threshold:" help="min value 1" />
 		    			<param name="generateSvg" argument="--generateSvg" type="boolean" checked="false" label="Generate SVG map" help="should the program generate an editable svg map of the processes?" />
-		    			<param name="generatePdf" argument="--generatePdf" type="boolean" checked="true" label="Generate PDF map" help="should the program return a non editable (but displayble) pdf map of the processes?" />
-		    			<param name="generateRas" argument="--generateRas" type="boolean" checked="false" label="Generate Reaction Activity Score for each table" help="Generate Reaction Activity Score for each table" />
+		    			<param name="generatePdf" argument="--generatePdf" type="boolean" checked="true" label="Generate PDF map" help="should the program return a non editable (but displayable) pdf map of the processes?" />	
+		    			<param name="generateRas" argument="--generateRas" type="boolean" checked="false" label="Generate Reaction Activity Score for each table" help="Generate Reaction Activity Score for each table" />		
 					</when>
     	</conditional>
             </when>
             <when value="datasets_rasonly">
                 <param name="input_Datasets" argument="--input_datas" type="data" format="tabular, csv, tsv" label="add dataset" />
                 <param name="input_name" argument="--names" type="text" label="Dataset's name:" value="Dataset" help="Default: Dataset" />
-                <param name="None" argument="--none" type="boolean" truevalue="true" falsevalue="false" checked="true" label="(A and NaN) solved as (A)?" />
+                <param name="None" argument="--none" type="boolean" truevalue="true" falsevalue="false" checked="true" label="(A and NaN) solved as (A)?" /> 
             </when>
             <when value="dataset_class">
                 <param name="input_data" argument="--input_data" type="data" format="tabular, csv, tsv" label="RNAseq of all samples" />
                 <param name="input_class" argument="--input_class" type="data" format="tabular, csv, tsv" label="Sample group specification" />
                 <conditional name="advanced">
-					<param name="choice" type="boolean" checked="false" label="Use advanced options?" help="Use this options to choose custom rules for evaluation: pValue, Fold-Change threshold, how to solve (A and NaN) and specify output maps.">
+					<param name="choice" type="boolean" checked="false" label="Use advanced options?" help="Use these options to choose custom parameters for evaluation: pValue, Fold-Change threshold, how to solve (A and NaN) and specify output maps.">
 		    			<option value="true" selected="true">No</option>
 		    			<option value="false">Yes</option>
 					</param>
 					<when value="false">
 					</when>
 					<when value="true">
-		    			<param name="None" argument="--none" type="boolean" truevalue="true" falsevalue="false" checked="true" label="(A and NaN) solved as (A)?" />
+		    			<param name="None" argument="--none" type="boolean" truevalue="true" falsevalue="false" checked="true" label="(A and NaN) solved as (A)?" /> 
 		    			<param name="pValue" argument="--pValue" type="float" size="20" value="0.01" max="1" min="0" label="P-value threshold:" help="min value 0" />
 		    			<param name="fChange" argument="--fChange" type="float" size="20" value="1.2" min="1" label="Fold-Change threshold:" help="min value 1" />
 		    			<param name="generateSvg" argument="--generateSvg" type="boolean" checked="false" label="Generate SVG map" help="should the program generate an editable svg map of the processes?" />
-		    			<param name="generatePdf" argument="--generatePdf" type="boolean" checked="true" label="Generate PDF map" help="should the program return a non editable (but displayble) pdf map of the processes?" />
-		    			<param name="generateRas" argument="--generateRas" type="boolean" checked="false" label="Generate Reaction Activity Score for each table" help="Generate Reaction Activity Score for each table" />
+		    			<param name="generatePdf" argument="--generatePdf" type="boolean" checked="true" label="Generate PDF map" help="should the program return a non editable (but displayable) pdf map of the processes?" />	
+		    			<param name="generateRas" argument="--generateRas" type="boolean" checked="false" label="Generate Reaction Activity Score for each table" help="Generate Reaction Activity Score for each table" />		
 					</when>
     	</conditional>
             </when>
         </conditional>
-
-
-
-
+       
+      
+       
+	
     </inputs>
 
     <outputs>
@@ -173,7 +173,7 @@
 	    <filter>cond['type_selector'] != "datasets_rasonly" and cond['advanced']['choice'] and cond['advanced']['generateRas']</filter>
     	    <discover_datasets pattern="__name_and_ext__" directory="ras" format="tabular"/>
 	</collection>
-
+	
     </outputs>
     <tests>
         <test>
@@ -189,7 +189,7 @@
 
 This tool analyzes RNA-seq dataset(s) as described in Graudenzi et al."`MaREA`_: Metabolic feature extraction, enrichment and visualization of RNAseq data" bioRxiv (2018): 248724.
 
-Accepted files are:
+Accepted files are: 
     - option 1) two or more RNA-seq datasets, each referring to samples in a given condition/class. The user can specify a label for each class (as e.g. "*classA*" and "*classB*");
     - option 2) one RNA dataset and one class-file specifying the class/condition each sample belongs to.
 
@@ -225,7 +225,7 @@
 
 **"RNAseq of group 1 + RNAseq of group 2 + ... + RNAseq of group N"** option:
 
-RNA-seq Dataset 1:
+RNA-seq Dataset 1:						
 
 @DATASET_EXEMPLE1@
 
@@ -241,14 +241,14 @@
 
 Class-file:
 
-+------------+------------+
-| Patient_ID |    class   |
-+============+============+
-| TCGAAA3529 |     MSI    |
-+------------+------------+
-| TCGAA62671 |     MSS    |
-+------------+------------+
-| TCGAA62672 |     MSI    |
++------------+------------+   
+| Patient_ID |    class   |   
++============+============+   
+| TCGAAA3529 |     MSI    |   
++------------+------------+    
+| TCGAA62671 |     MSS    |    
++------------+------------+    
+| TCGAA62672 |     MSI    |   
 +------------+------------+
 
 |
@@ -271,3 +271,4 @@
     </help>
     <expand macro="citations" />
 </tool>
+	
--- a/Marea/marea_cluster.py	Wed Oct 16 07:12:37 2019 -0400
+++ b/Marea/marea_cluster.py	Wed Oct 16 16:25:56 2019 -0400
@@ -72,11 +72,11 @@
                         help = 'your tool directory')
                         
     parser.add_argument('-ms', '--min_samples',
-                        type = int,
+                        type = float,
                         help = 'min samples for dbscan (optional)')
                         
     parser.add_argument('-ep', '--eps',
-                        type = int,
+                        type = float,
                         help = 'eps for dbscan (optional)')
                         
     parser.add_argument('-bc', '--best_cluster',
@@ -310,7 +310,7 @@
             
 ######################## dbscan ##############################################
     
-def dbscan(dataset, eps, min_samples):
+def dbscan(dataset, eps, min_samples, best_cluster):
     if not os.path.exists('clustering'):
         os.makedirs('clustering')
         
@@ -331,12 +331,15 @@
     
     ##TODO: PLOT SU DBSCAN (no centers) e HIERARCHICAL
     
-    
-    write_to_csv(dataset, labels, 'clustering/dbscan_results.tsv')
+    labels = labels
+    predict = [x+1 for x in labels]
+    classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
+    classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])
+  
     
 ########################## hierachical #######################################
     
-def hierachical_agglomerative(dataset, k_min, k_max):
+def hierachical_agglomerative(dataset, k_min, k_max, best_cluster):
 
     if not os.path.exists('clustering'):
         os.makedirs('clustering')
@@ -349,16 +352,28 @@
     
     range_n_clusters = [i for i in range(k_min, k_max+1)]
 
-    for n_clusters in range_n_clusters:
-        
+    scores = []
+    labels = []
+    for n_clusters in range_n_clusters:    
         cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward')  
         cluster.fit_predict(dataset)  
         cluster_labels = cluster.labels_
-        
+        labels.append(cluster_labels)
         silhouette_avg = silhouette_score(dataset, cluster_labels)
         write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv')
+        scores.append(silhouette_avg)
         #warning("For n_clusters =", n_clusters,
               #"The average silhouette_score is :", silhouette_avg)
+              
+    best = max_index(scores) + k_min
+     
+    for i in range(len(labels)):
+        if (i + k_min == best):
+            labels = labels[i]
+            predict = [x+1 for x in labels]
+            classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
+            classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])
+     
         
         
        
@@ -390,10 +405,10 @@
         kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies, args.best_cluster)
     
     if args.cluster_type == 'dbscan':
-        dbscan(X, args.eps, args.min_samples)
+        dbscan(X, args.eps, args.min_samples, args.best_cluster)
         
     if args.cluster_type == 'hierarchy':
-        hierachical_agglomerative(X, args.k_min, args.k_max)
+        hierachical_agglomerative(X, args.k_min, args.k_max, args.best_cluster)
         
 ##############################################################################
 
--- a/Marea/marea_cluster.xml	Wed Oct 16 07:12:37 2019 -0400
+++ b/Marea/marea_cluster.xml	Wed Oct 16 16:25:56 2019 -0400
@@ -1,4 +1,4 @@
-<tool id="MaREA_cluester" name="Cluster Analysis" version="1.0.6">
+<tool id="MaREA_cluester" name="Cluster Analysis" version="1.0.7">
     <description></description>
     <macros>
         <import>marea_macros.xml</import>
@@ -75,9 +75,10 @@
 
     <outputs>
         <data format="txt" name="log" label="${tool.name} - Log" />
-        <data format="tabular" name="best_cluster" label="${tool.name} - Best cluster" />
+        <data format="tabular" name="best_cluster" label="${tool.name} - best cluster assignment" />
         <collection name="results" type="list" label="${tool.name} - Plots and results">
             <discover_datasets pattern="__name_and_ext__" directory="clustering"/>
+            <filter>data['clust_type'] == "kmeans" or data['clust_type'] == "hierarchy"</filter>
         </collection>
     </outputs>
     <help>