changeset 28:e6831924df01 draft

small fixes (elbow plot and output management)
author bimib
date Mon, 14 Oct 2019 05:01:08 -0400
parents 8c480c977a12
children 9fcb0e8d6d47
files Marea/marea.py Marea/marea.xml Marea/marea_cluster.py Marea/marea_cluster.xml
diffstat 4 files changed, 89 insertions(+), 29 deletions(-)
--- a/Marea/marea.py	Mon Oct 07 13:48:01 2019 -0400
+++ b/Marea/marea.py	Mon Oct 14 05:01:08 2019 -0400
@@ -50,7 +50,7 @@
                         help = 'your tool directory')
     parser.add_argument('-op', '--option', 
                         type = str, 
-                        choices = ['datasets', 'dataset_class'],
+                        choices = ['datasets', 'dataset_class', 'datasets_rasonly'],
                         help='dataset or dataset and class')
     parser.add_argument('-ol', '--out_log', 
                         help = "Output log")    
@@ -86,6 +86,10 @@
                         default = 'true',
                         choices = ['true', 'false'],
                         help = 'generate reaction activity score')
+    parser.add_argument('-sr', '--single_ras_file',  
+                         type = str,              
+                         help = 'output file that will contain the RAS values')
+    					
     args = parser.parse_args()
     return args
 
@@ -658,7 +662,7 @@
 
 ############################ create_ras #######################################
 
-def create_ras (resolve_rules, dataset_name):
+def create_ras (resolve_rules, dataset_name, single_ras):
 
     if resolve_rules == None:
         warning("Couldn't generate RAS for current dataset: " + dataset_name)
@@ -670,8 +674,13 @@
                 
     output_ras = pd.DataFrame.from_dict(resolve_rules)
     output_to_csv = pd.DataFrame.to_csv(output_ras, sep = '\t', index = False)
-                
-    text_file = open("ras/Reaction_Activity_Score_Of_" + dataset_name + ".tsv", "w")
+    
+    if (single_ras):
+        args = process_args(sys.argv)
+        text_file = open(args.single_ras_file, "w")
+    else:
+        text_file = open("ras/Reaction_Activity_Score_Of_" + dataset_name + ".tsv", "w")
+    
     text_file.write(output_to_csv)
     text_file.close()
 
@@ -749,7 +758,34 @@
     
     class_pat = {}
     
-    if args.option == 'datasets':
+    if args.option == 'datasets_rasonly':
+        name = "RAS Dataset"
+        dataset = read_dataset(args.input_datas[0],"dataset")
+
+        dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
+
+        type_gene = gene_type(dataset.iloc[0, 0], name) 
+            
+        if args.rules_selector != 'Custom':
+            genes = data_gene(dataset, type_gene, name, None)
+            ids, rules = load_id_rules(recon.get(type_gene))
+        elif args.rules_selector == 'Custom':
+            genes = data_gene(dataset, type_gene, name, gene_in_rule)
+                
+        resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
+
+        create_ras(resolve_rules, name, True)
+          
+        if err != None and err:
+            warning('Warning: gene\n' + str(err) + '\nnot found in class '
+                + name + ', the expression level for this gene ' +
+                'will be considered NaN\n')
+        
+        print('execution succeeded')
+        return None
+    
+    
+    elif args.option == 'datasets':
         num = 1
         for i, j in zip(args.input_datas, args.names):
 
@@ -769,8 +805,7 @@
             resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
 
             if generate_ras:
-                create_ras(resolve_rules, name)
-                
+                create_ras(resolve_rules, name, False)
             
             if err != None and err:
                 warning('Warning: gene\n' + str(err) + '\nnot found in class '
@@ -801,7 +836,8 @@
                     'will be considered NaN\n')
         if resolve_rules != None:
             class_pat = split_class(classes, resolve_rules)
-            
+    
+    	
     if args.rules_selector == 'Custom':
         if args.yes_no == 'yes':
             try:
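
Note on the marea.py change above: with the new datasets_rasonly option, create_ras receives a third argument and writes the RAS table to the single file passed via --single_ras_file, instead of one file per dataset under the ras/ directory that Galaxy discovers as a collection. A minimal self-contained sketch of that branching logic (function and variable names here are illustrative, not taken from the changeset):

import pandas as pd

def write_ras_table(resolve_rules, dataset_name, single_ras_file=None):
    """Write a RAS table either to one user-chosen file or to the ras/ collection directory."""
    output_ras = pd.DataFrame.from_dict(resolve_rules)
    output_to_csv = output_ras.to_csv(sep='\t', index=False)
    if single_ras_file:
        # datasets_rasonly mode: a single tabular dataset in the Galaxy history
        path = single_ras_file
    else:
        # datasets / dataset_class modes: one file per dataset, collected from ras/
        path = "ras/Reaction_Activity_Score_Of_" + dataset_name + ".tsv"
    with open(path, "w") as text_file:
        text_file.write(output_to_csv)
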
--- a/Marea/marea.xml	Mon Oct 07 13:48:01 2019 -0400
+++ b/Marea/marea.xml	Mon Oct 14 05:01:08 2019 -0400
@@ -54,6 +54,10 @@
             --input_data ${input_data}
             --input_class ${input_class}
         #end if
+        #if $cond.type_selector == 'datasets_rasonly':
+            --input_datas ${input_Datasets}
+            --single_ras_file $ras_single
+        #end if
         ]]>
     </command>
 
@@ -83,6 +87,7 @@
             <param name="type_selector" argument="--option" type="select" label="Input format:">
                 <option value="datasets" selected="true">RNAseq of group 1 + RNAseq of group 2 + ... + RNAseq of group N</option>
                 <option value="dataset_class">RNAseq of all samples + sample group specification</option>
+                <option value="datasets_rasonly">RNAseq dataset (RAS only)</option>
             </param>
             <when value="datasets">
                 <repeat name="input_Datasets" title="RNAseq" min="2">
@@ -90,12 +95,18 @@
                     <param name="input_name" argument="--names" type="text" label="Dataset's name:" value="Dataset" help="Default: Dataset" />
                 </repeat>
             </when>
+            <when value="datasets_rasonly">
+                <param name="input_Datasets" argument="--input_datas" type="data" format="tabular, csv, tsv" label="RNAseq dataset" />
+            </when>
             <when value="dataset_class">
                 <param name="input_data" argument="--input_data" type="data" format="tabular, csv, tsv" label="RNAseq of all samples" />
                 <param name="input_class" argument="--input_class" type="data" format="tabular, csv, tsv" label="Sample group specification" />
             </when>
         </conditional>
        
+       
+       <!--TODO: hide the advanced options section when the RAS-only mode is selected-->
+       
 	<conditional name="advanced">
 		<param name="choice" type="boolean" checked="false" label="Use advanced options?" help="Use this options to choose custom rules for evaluation: pValue, Fold-Change threshold, how to solve (A and NaN) and specify output maps.">
 		    <option value="true" selected="true">No</option>
@@ -116,13 +127,18 @@
 
     <outputs>
         <data format="txt" name="log" label="${tool.name} - Log" />
+        <data format="tabular" name="ras_single" label="${tool.name} - RAS">
+            <filter>cond['type_selector'] == "datasets_rasonly"</filter>
+        </data>
         <collection name="results" type="list" label="${tool.name} - Results">
+            <filter>cond['type_selector'] == "datasets" or cond['type_selector'] == "dataset_class"</filter>
             <discover_datasets pattern="__name_and_ext__" directory="result"/>
         </collection>
-	<collection name="ras" type="list" label="${tool.name} - RAS" format_source="tabular">
+	<collection name="ras" type="list" label="${tool.name} - RAS list" format_source="tabular">
 	    <filter>advanced['choice'] and advanced['generateRas']</filter>
     	    <discover_datasets pattern="__name_and_ext__" directory="ras" format="tabular"/>
 	</collection>
+	
     </outputs>
     <tests>
         <test>
--- a/Marea/marea_cluster.py	Mon Oct 07 13:48:01 2019 -0400
+++ b/Marea/marea_cluster.py	Mon Oct 14 05:01:08 2019 -0400
@@ -78,6 +78,11 @@
     parser.add_argument('-ep', '--eps',
                         type = int,
                         help = 'eps for dbscan (optional)')
+                        
+    parser.add_argument('-bc', '--best_cluster',
+                        type = str,
+                        help = 'output file (tsv) for the best clustering')
+    				
     
     
     args = parser.parse_args()
@@ -131,20 +136,7 @@
     dest = name
     classe.to_csv(dest, sep = '\t', index = False,
                       header = ['Patient_ID', 'Class'])
-    
-
-      #list_labels = labels
-    #list_values = dataset
-
-    #list_values = list_values.tolist()
-    #d = {'Label' : list_labels, 'Value' : list_values}
-    
-    #df = pd.DataFrame(d, columns=['Value','Label'])
-
-    #dest = name + '.tsv'
-    #df.to_csv(dest, sep = '\t', index = False,
-     #                 header = ['Value', 'Label'])
-    
+   
 ########################### find the maximum in a list ########################
 def max_index (lista):
     best = -1
@@ -158,7 +150,7 @@
     
 ################################ kmeans #####################################
     
-def kmeans (k_min, k_max, dataset, elbow, silhouette, davies):
+def kmeans (k_min, k_max, dataset, elbow, silhouette, davies, best_cluster):
     if not os.path.exists('clustering'):
         os.makedirs('clustering')
     
@@ -189,7 +181,10 @@
         cluster_labels = clusterer.fit_predict(dataset)
         
         all_labels.append(cluster_labels)
-        silhouette_avg = silhouette_score(dataset, cluster_labels)
+        if n_clusters == 1:
+            silhouette_avg = 0
+        else:
+            silhouette_avg = silhouette_score(dataset, cluster_labels)
         scores.append(silhouette_avg)
         distortions.append(clusterer.fit(dataset).inertia_)
         
@@ -201,6 +196,14 @@
             prefix = '_BEST'
             
         write_to_csv(dataset, all_labels[i], 'clustering/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv')
+        
+        
+        if (prefix == '_BEST'):
+            labels = all_labels[i]
+            predict = [x+1 for x in labels]
+            classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
+            classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])
+            
             
         if davies:
             with np.errstate(divide='ignore', invalid='ignore'):
@@ -235,6 +238,9 @@
     
 ############################## silhouette plot ###############################
 def silihouette_draw(dataset, labels, n_clusters, path):
+    if n_clusters == 1:
+        return None
+        
     silhouette_avg = silhouette_score(dataset, labels)
     warning("For n_clusters = " + str(n_clusters) +
           " The average silhouette_score is: " + str(silhouette_avg))
@@ -375,7 +381,7 @@
     
     
     if args.cluster_type == 'kmeans':
-        kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies)
+        kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies, args.best_cluster)
     
     if args.cluster_type == 'dbscan':
         dbscan(X, args.eps, args.min_samples)
@@ -386,4 +392,4 @@
 ##############################################################################
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
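
Note on the marea_cluster.py change above: with the k_min lower bound relaxed to 1 in marea_cluster.xml below, the kmeans loop can now request a single cluster, but scikit-learn's silhouette_score raises a ValueError when the labelling contains fewer than two distinct clusters. That is why the loop now falls back to a score of 0 and silihouette_draw returns early for n_clusters == 1. A minimal sketch of that guard in isolation (the helper name below is illustrative, not part of the tool):

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def safe_silhouette(dataset, n_clusters):
    """Return the silhouette score for a KMeans clustering, or 0 when k == 1."""
    clusterer = KMeans(n_clusters=n_clusters, random_state=0)
    labels = clusterer.fit_predict(dataset)
    if n_clusters == 1:
        # silhouette_score needs at least 2 distinct labels; mirror the patch and use 0
        return 0.0
    return silhouette_score(dataset, labels)
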
--- a/Marea/marea_cluster.xml	Mon Oct 07 13:48:01 2019 -0400
+++ b/Marea/marea_cluster.xml	Mon Oct 14 05:01:08 2019 -0400
@@ -17,6 +17,7 @@
         --input $input
       	--tool_dir $__tool_directory__
         --out_log $log
+        --best_cluster $best_cluster
         --cluster_type ${data.clust_type}
         #if $data.clust_type == 'kmeans':
         	--k_min ${data.k_min}
@@ -46,7 +47,7 @@
                 	<option value="hierarchy">Agglomerative Hierarchical</option>
         	</param>
         	<when value="kmeans">
-        		<param name="k_min" argument="--k_min" type="integer" min="2" max="99" value="3" label="Min number of clusters (k) to be tested" />
+        		<param name="k_min" argument="--k_min" type="integer" min="1" max="99" value="3" label="Min number of clusters (k) to be tested" />
         		<param name="k_max" argument="--k_max" type="integer" min="3" max="99" value="5" label="Max number of clusters (k) to be tested" />
         		<param name="elbow" argument="--elbow" type="boolean" value="true" label="Draw the elbow plot from k-min to k-max"/>
         		<param name="silhouette" argument="--silhouette" type="boolean" value="true" label="Draw the Silhouette plot from k-min to k-max"/>
@@ -74,7 +75,8 @@
 
     <outputs>
         <data format="txt" name="log" label="${tool.name} - Log" />
-        <collection name="results" type="list" label="${tool.name} - Results">
+        <data format="tabular" name="best_cluster" label="${tool.name} - Best cluster" />
+        <collection name="results" type="list" label="${tool.name} - Plots and results">
             <discover_datasets pattern="__name_and_ext__" directory="clustering"/>
         </collection>
     </outputs>