Mercurial > repos > bimib > marea
changeset 33:abf0bfe01c78 draft
Uploaded
author | bimib |
---|---|
date | Wed, 16 Oct 2019 16:25:56 -0400 |
parents | b795e3e163e0 |
children | 1a97d1537623 |
files | Marea/marea.xml Marea/marea_cluster.py Marea/marea_cluster.xml |
diffstat | 3 files changed, 61 insertions(+), 44 deletions(-) [+] |
line wrap: on
line diff
--- a/Marea/marea.xml Wed Oct 16 07:12:37 2019 -0400 +++ b/Marea/marea.xml Wed Oct 16 16:25:56 2019 -0400 @@ -22,11 +22,11 @@ --custom_map $cond_rule.cond_map.Custom_map #end if #end if - + --tool_dir $__tool_directory__ --option $cond.type_selector - --out_log $log - + --out_log $log + #if $cond.type_selector == 'datasets': --input_datas #for $data in $cond.input_Datasets: @@ -43,7 +43,7 @@ --generate_svg ${cond.advanced.generateSvg} --generate_pdf ${cond.advanced.generatePdf} --generate_ras ${cond.advanced.generateRas} - #else + #else --none true --pValue 0.05 --fChange 1.5 @@ -61,7 +61,7 @@ --generate_svg ${cond.advanced.generateSvg} --generate_pdf ${cond.advanced.generatePdf} --generate_ras ${cond.advanced.generateRas} - #else + #else --none true --pValue 0.05 --fChange 1.5 @@ -73,7 +73,7 @@ #if $cond.type_selector == 'datasets_rasonly': --input_datas ${input_Datasets} --single_ras_file $ras_single - --none ${None} + --none ${cond.None} #end if ]]> </command> @@ -108,56 +108,56 @@ </param> <when value="datasets"> <repeat name="input_Datasets" title="RNAseq" min="2"> - <param name="input" argument="--input_datas" type="data" format="tabular, csv, tsv" label="add dataset" /> + <param name="input" argument="--input_datas" type="data" format="tabular, csv, tsv" label="add dataset" /> <param name="input_name" argument="--names" type="text" label="Dataset's name:" value="Dataset" help="Default: Dataset" /> </repeat> <conditional name="advanced"> - <param name="choice" type="boolean" checked="false" label="Use advanced options?" help="Use this options to choose custom rules for evaluation: pValue, Fold-Change threshold, how to solve (A and NaN) and specify output maps."> + <param name="choice" type="boolean" checked="false" label="Use advanced options?" help="Use this options to choose custom parameters for evaluation: pValue, Fold-Change threshold, how to solve (A and NaN) and specify output maps."> <option value="true" selected="true">No</option> <option value="false">Yes</option> </param> <when value="false"> </when> <when value="true"> - <param name="None" argument="--none" type="boolean" truevalue="true" falsevalue="false" checked="true" label="(A and NaN) solved as (A)?" /> + <param name="None" argument="--none" type="boolean" truevalue="true" falsevalue="false" checked="true" label="(A and NaN) solved as (A)?" /> <param name="pValue" argument="--pValue" type="float" size="20" value="0.01" max="1" min="0" label="P-value threshold:" help="min value 0" /> <param name="fChange" argument="--fChange" type="float" size="20" value="1.2" min="1" label="Fold-Change threshold:" help="min value 1" /> <param name="generateSvg" argument="--generateSvg" type="boolean" checked="false" label="Generate SVG map" help="should the program generate an editable svg map of the processes?" /> - <param name="generatePdf" argument="--generatePdf" type="boolean" checked="true" label="Generate PDF map" help="should the program return a non editable (but displayble) pdf map of the processes?" /> - <param name="generateRas" argument="--generateRas" type="boolean" checked="false" label="Generate Reaction Activity Score for each table" help="Generate Reaction Activity Score for each table" /> + <param name="generatePdf" argument="--generatePdf" type="boolean" checked="true" label="Generate PDF map" help="should the program return a non editable (but displayble) pdf map of the processes?" /> + <param name="generateRas" argument="--generateRas" type="boolean" checked="false" label="Generate Reaction Activity Score for each table" help="Generate Reaction Activity Score for each table" /> </when> </conditional> </when> <when value="datasets_rasonly"> <param name="input_Datasets" argument="--input_datas" type="data" format="tabular, csv, tsv" label="add dataset" /> <param name="input_name" argument="--names" type="text" label="Dataset's name:" value="Dataset" help="Default: Dataset" /> - <param name="None" argument="--none" type="boolean" truevalue="true" falsevalue="false" checked="true" label="(A and NaN) solved as (A)?" /> + <param name="None" argument="--none" type="boolean" truevalue="true" falsevalue="false" checked="true" label="(A and NaN) solved as (A)?" /> </when> <when value="dataset_class"> <param name="input_data" argument="--input_data" type="data" format="tabular, csv, tsv" label="RNAseq of all samples" /> <param name="input_class" argument="--input_class" type="data" format="tabular, csv, tsv" label="Sample group specification" /> <conditional name="advanced"> - <param name="choice" type="boolean" checked="false" label="Use advanced options?" help="Use this options to choose custom rules for evaluation: pValue, Fold-Change threshold, how to solve (A and NaN) and specify output maps."> + <param name="choice" type="boolean" checked="false" label="Use advanced options?" help="Use this options to choose custom parameters for evaluation: pValue, Fold-Change threshold, how to solve (A and NaN) and specify output maps."> <option value="true" selected="true">No</option> <option value="false">Yes</option> </param> <when value="false"> </when> <when value="true"> - <param name="None" argument="--none" type="boolean" truevalue="true" falsevalue="false" checked="true" label="(A and NaN) solved as (A)?" /> + <param name="None" argument="--none" type="boolean" truevalue="true" falsevalue="false" checked="true" label="(A and NaN) solved as (A)?" /> <param name="pValue" argument="--pValue" type="float" size="20" value="0.01" max="1" min="0" label="P-value threshold:" help="min value 0" /> <param name="fChange" argument="--fChange" type="float" size="20" value="1.2" min="1" label="Fold-Change threshold:" help="min value 1" /> <param name="generateSvg" argument="--generateSvg" type="boolean" checked="false" label="Generate SVG map" help="should the program generate an editable svg map of the processes?" /> - <param name="generatePdf" argument="--generatePdf" type="boolean" checked="true" label="Generate PDF map" help="should the program return a non editable (but displayble) pdf map of the processes?" /> - <param name="generateRas" argument="--generateRas" type="boolean" checked="false" label="Generate Reaction Activity Score for each table" help="Generate Reaction Activity Score for each table" /> + <param name="generatePdf" argument="--generatePdf" type="boolean" checked="true" label="Generate PDF map" help="should the program return a non editable (but displayble) pdf map of the processes?" /> + <param name="generateRas" argument="--generateRas" type="boolean" checked="false" label="Generate Reaction Activity Score for each table" help="Generate Reaction Activity Score for each table" /> </when> </conditional> </when> </conditional> - - - - + + + + </inputs> <outputs> @@ -173,7 +173,7 @@ <filter>cond['type_selector'] != "datasets_rasonly" and cond['advanced']['choice'] and cond['advanced']['generateRas']</filter> <discover_datasets pattern="__name_and_ext__" directory="ras" format="tabular"/> </collection> - + </outputs> <tests> <test> @@ -189,7 +189,7 @@ This tool analyzes RNA-seq dataset(s) as described in Graudenzi et al."`MaREA`_: Metabolic feature extraction, enrichment and visualization of RNAseq data" bioRxiv (2018): 248724. -Accepted files are: +Accepted files are: - option 1) two or more RNA-seq datasets, each referring to samples in a given condition/class. The user can specify a label for each class (as e.g. "*classA*" and "*classB*"); - option 2) one RNA dataset and one class-file specifying the class/condition each sample belongs to. @@ -225,7 +225,7 @@ **"RNAseq of group 1 + RNAseq of group 2 + ... + RNAseq of group N"** option: -RNA-seq Dataset 1: +RNA-seq Dataset 1: @DATASET_EXEMPLE1@ @@ -241,14 +241,14 @@ Class-file: -+------------+------------+ -| Patient_ID | class | -+============+============+ -| TCGAAA3529 | MSI | -+------------+------------+ -| TCGAA62671 | MSS | -+------------+------------+ -| TCGAA62672 | MSI | ++------------+------------+ +| Patient_ID | class | ++============+============+ +| TCGAAA3529 | MSI | ++------------+------------+ +| TCGAA62671 | MSS | ++------------+------------+ +| TCGAA62672 | MSI | +------------+------------+ | @@ -271,3 +271,4 @@ </help> <expand macro="citations" /> </tool> +
--- a/Marea/marea_cluster.py Wed Oct 16 07:12:37 2019 -0400 +++ b/Marea/marea_cluster.py Wed Oct 16 16:25:56 2019 -0400 @@ -72,11 +72,11 @@ help = 'your tool directory') parser.add_argument('-ms', '--min_samples', - type = int, + type = float, help = 'min samples for dbscan (optional)') parser.add_argument('-ep', '--eps', - type = int, + type = float, help = 'eps for dbscan (optional)') parser.add_argument('-bc', '--best_cluster', @@ -310,7 +310,7 @@ ######################## dbscan ############################################## -def dbscan(dataset, eps, min_samples): +def dbscan(dataset, eps, min_samples, best_cluster): if not os.path.exists('clustering'): os.makedirs('clustering') @@ -331,12 +331,15 @@ ##TODO: PLOT SU DBSCAN (no centers) e HIERARCHICAL - - write_to_csv(dataset, labels, 'clustering/dbscan_results.tsv') + labels = labels + predict = [x+1 for x in labels] + classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) + classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) + ########################## hierachical ####################################### -def hierachical_agglomerative(dataset, k_min, k_max): +def hierachical_agglomerative(dataset, k_min, k_max, best_cluster): if not os.path.exists('clustering'): os.makedirs('clustering') @@ -349,16 +352,28 @@ range_n_clusters = [i for i in range(k_min, k_max+1)] - for n_clusters in range_n_clusters: - + scores = [] + labels = [] + for n_clusters in range_n_clusters: cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward') cluster.fit_predict(dataset) cluster_labels = cluster.labels_ - + labels.append(cluster_labels) silhouette_avg = silhouette_score(dataset, cluster_labels) write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv') + scores.append(silhouette_avg) #warning("For n_clusters =", n_clusters, #"The average silhouette_score is :", silhouette_avg) + + best = max_index(scores) + k_min + + for i in range(len(labels)): + if (i + k_min == best): + labels = labels[i] + predict = [x+1 for x in labels] + classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) + classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) + @@ -390,10 +405,10 @@ kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies, args.best_cluster) if args.cluster_type == 'dbscan': - dbscan(X, args.eps, args.min_samples) + dbscan(X, args.eps, args.min_samples, args.best_cluster) if args.cluster_type == 'hierarchy': - hierachical_agglomerative(X, args.k_min, args.k_max) + hierachical_agglomerative(X, args.k_min, args.k_max, args.best_cluster) ##############################################################################
--- a/Marea/marea_cluster.xml Wed Oct 16 07:12:37 2019 -0400 +++ b/Marea/marea_cluster.xml Wed Oct 16 16:25:56 2019 -0400 @@ -1,4 +1,4 @@ -<tool id="MaREA_cluester" name="Cluster Analysis" version="1.0.6"> +<tool id="MaREA_cluester" name="Cluster Analysis" version="1.0.7"> <description></description> <macros> <import>marea_macros.xml</import> @@ -75,9 +75,10 @@ <outputs> <data format="txt" name="log" label="${tool.name} - Log" /> - <data format="tabular" name="best_cluster" label="${tool.name} - Best cluster" /> + <data format="tabular" name="best_cluster" label="${tool.name} - best cluster assignment" /> <collection name="results" type="list" label="${tool.name} - Plots and results"> <discover_datasets pattern="__name_and_ext__" directory="clustering"/> + <filter>data['clust_type'] == "kmeans" or data['clust_type'] == "hierarchy"</filter> </collection> </outputs> <help>