Mercurial > repos > bimib > marea
changeset 28:e6831924df01 draft
small fixes (elbow plot and output managment)
author | bimib |
---|---|
date | Mon, 14 Oct 2019 05:01:08 -0400 |
parents | 8c480c977a12 |
children | 9fcb0e8d6d47 |
files | Marea/marea.py Marea/marea.xml Marea/marea_cluster.py Marea/marea_cluster.xml |
diffstat | 4 files changed, 89 insertions(+), 29 deletions(-) [+] |
line wrap: on
line diff
--- a/Marea/marea.py Mon Oct 07 13:48:01 2019 -0400 +++ b/Marea/marea.py Mon Oct 14 05:01:08 2019 -0400 @@ -50,7 +50,7 @@ help = 'your tool directory') parser.add_argument('-op', '--option', type = str, - choices = ['datasets', 'dataset_class'], + choices = ['datasets', 'dataset_class', 'datasets_rasonly'], help='dataset or dataset and class') parser.add_argument('-ol', '--out_log', help = "Output log") @@ -86,6 +86,10 @@ default = 'true', choices = ['true', 'false'], help = 'generate reaction activity score') + parser.add_argument('-sr', '--single_ras_file', + type = str, + help = 'file that will contain ras') + args = parser.parse_args() return args @@ -658,7 +662,7 @@ ############################ create_ras ####################################### -def create_ras (resolve_rules, dataset_name): +def create_ras (resolve_rules, dataset_name, single_ras): if resolve_rules == None: warning("Couldn't generate RAS for current dataset: " + dataset_name) @@ -670,8 +674,13 @@ output_ras = pd.DataFrame.from_dict(resolve_rules) output_to_csv = pd.DataFrame.to_csv(output_ras, sep = '\t', index = False) - - text_file = open("ras/Reaction_Activity_Score_Of_" + dataset_name + ".tsv", "w") + + if (single_ras): + args = process_args(sys.argv) + text_file = open(args.single_ras_file, "w") + else: + text_file = open("ras/Reaction_Activity_Score_Of_" + dataset_name + ".tsv", "w") + text_file.write(output_to_csv) text_file.close() @@ -749,7 +758,34 @@ class_pat = {} - if args.option == 'datasets': + if args.option == 'datasets_rasonly': + name = "RAS Dataset" + dataset = read_dataset(args.input_datas[0],"dataset") + + dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) + + type_gene = gene_type(dataset.iloc[0, 0], name) + + if args.rules_selector != 'Custom': + genes = data_gene(dataset, type_gene, name, None) + ids, rules = load_id_rules(recon.get(type_gene)) + elif args.rules_selector == 'Custom': + genes = data_gene(dataset, type_gene, name, gene_in_rule) + + resolve_rules, err = resolve(genes, rules, ids, resolve_none, name) + + create_ras(resolve_rules, name, True) + + if err != None and err: + warning('Warning: gene\n' + str(err) + '\nnot found in class ' + + name + ', the expression level for this gene ' + + 'will be considered NaN\n') + + print('execution succeded') + return None + + + elif args.option == 'datasets': num = 1 for i, j in zip(args.input_datas, args.names): @@ -769,8 +805,7 @@ resolve_rules, err = resolve(genes, rules, ids, resolve_none, name) if generate_ras: - create_ras(resolve_rules, name) - + create_ras(resolve_rules, name, False) if err != None and err: warning('Warning: gene\n' + str(err) + '\nnot found in class ' @@ -801,7 +836,8 @@ 'will be considered NaN\n') if resolve_rules != None: class_pat = split_class(classes, resolve_rules) - + + if args.rules_selector == 'Custom': if args.yes_no == 'yes': try:
--- a/Marea/marea.xml Mon Oct 07 13:48:01 2019 -0400 +++ b/Marea/marea.xml Mon Oct 14 05:01:08 2019 -0400 @@ -54,6 +54,10 @@ --input_data ${input_data} --input_class ${input_class} #end if + #if $cond.type_selector == 'datasets_rasonly': + --input_datas ${input_Datasets} + --single_ras_file $ras_single + #end if ]]> </command> @@ -83,6 +87,7 @@ <param name="type_selector" argument="--option" type="select" label="Input format:"> <option value="datasets" selected="true">RNAseq of group 1 + RNAseq of group 2 + ... + RNAseq of group N</option> <option value="dataset_class">RNAseq of all samples + sample group specification</option> + <option value="datasets_rasonly" selected="true">RNAseq dataset</option> </param> <when value="datasets"> <repeat name="input_Datasets" title="RNAseq" min="2"> @@ -90,12 +95,18 @@ <param name="input_name" argument="--names" type="text" label="Dataset's name:" value="Dataset" help="Default: Dataset" /> </repeat> </when> + <when value="datasets_rasonly"> + <param name="input_Datasets" argument="--input_datas" type="data" format="tabular, csv, tsv" label="add dataset" /> + </when> <when value="dataset_class"> <param name="input_data" argument="--input_data" type="data" format="tabular, csv, tsv" label="RNAseq of all samples" /> <param name="input_class" argument="--input_class" type="data" format="tabular, csv, tsv" label="Sample group specification" /> </when> </conditional> + + <!--TODO: NASCONDERE ADVANCED SE RAS ONLY--> + <conditional name="advanced"> <param name="choice" type="boolean" checked="false" label="Use advanced options?" help="Use this options to choose custom rules for evaluation: pValue, Fold-Change threshold, how to solve (A and NaN) and specify output maps."> <option value="true" selected="true">No</option> @@ -116,13 +127,18 @@ <outputs> <data format="txt" name="log" label="${tool.name} - Log" /> + <data format="tabular" name="ras_single" label="${tool.name} - RAS"> + <filter>cond['type_selector'] == "datasets_rasonly"</filter> + </data> <collection name="results" type="list" label="${tool.name} - Results"> + <filter>cond['type_selector'] == "datasets" or cond['type_selector'] == "dataset_class"</filter> <discover_datasets pattern="__name_and_ext__" directory="result"/> </collection> - <collection name="ras" type="list" label="${tool.name} - RAS" format_source="tabular"> + <collection name="ras" type="list" label="${tool.name} - RAS list" format_source="tabular"> <filter>advanced['choice'] and advanced['generateRas']</filter> <discover_datasets pattern="__name_and_ext__" directory="ras" format="tabular"/> </collection> + </outputs> <tests> <test>
--- a/Marea/marea_cluster.py Mon Oct 07 13:48:01 2019 -0400 +++ b/Marea/marea_cluster.py Mon Oct 14 05:01:08 2019 -0400 @@ -78,6 +78,11 @@ parser.add_argument('-ep', '--eps', type = int, help = 'eps for dbscan (optional)') + + parser.add_argument('-bc', '--best_cluster', + type = str, + help = 'output of best cluster tsv') + args = parser.parse_args() @@ -131,20 +136,7 @@ dest = name classe.to_csv(dest, sep = '\t', index = False, header = ['Patient_ID', 'Class']) - - - #list_labels = labels - #list_values = dataset - - #list_values = list_values.tolist() - #d = {'Label' : list_labels, 'Value' : list_values} - - #df = pd.DataFrame(d, columns=['Value','Label']) - - #dest = name + '.tsv' - #df.to_csv(dest, sep = '\t', index = False, - # header = ['Value', 'Label']) - + ########################### trova il massimo in lista ######################## def max_index (lista): best = -1 @@ -158,7 +150,7 @@ ################################ kmeans ##################################### -def kmeans (k_min, k_max, dataset, elbow, silhouette, davies): +def kmeans (k_min, k_max, dataset, elbow, silhouette, davies, best_cluster): if not os.path.exists('clustering'): os.makedirs('clustering') @@ -189,7 +181,10 @@ cluster_labels = clusterer.fit_predict(dataset) all_labels.append(cluster_labels) - silhouette_avg = silhouette_score(dataset, cluster_labels) + if n_clusters == 1: + silhouette_avg = 0 + else: + silhouette_avg = silhouette_score(dataset, cluster_labels) scores.append(silhouette_avg) distortions.append(clusterer.fit(dataset).inertia_) @@ -201,6 +196,14 @@ prefix = '_BEST' write_to_csv(dataset, all_labels[i], 'clustering/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv') + + + if (prefix == '_BEST'): + labels = all_labels[i] + predict = [x+1 for x in labels] + classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) + classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) + if davies: with np.errstate(divide='ignore', invalid='ignore'): @@ -235,6 +238,9 @@ ############################## silhouette plot ############################### def silihouette_draw(dataset, labels, n_clusters, path): + if n_clusters == 1: + return None + silhouette_avg = silhouette_score(dataset, labels) warning("For n_clusters = " + str(n_clusters) + " The average silhouette_score is: " + str(silhouette_avg)) @@ -375,7 +381,7 @@ if args.cluster_type == 'kmeans': - kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies) + kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies, args.best_cluster) if args.cluster_type == 'dbscan': dbscan(X, args.eps, args.min_samples) @@ -386,4 +392,4 @@ ############################################################################## if __name__ == "__main__": - main() \ No newline at end of file + main()
--- a/Marea/marea_cluster.xml Mon Oct 07 13:48:01 2019 -0400 +++ b/Marea/marea_cluster.xml Mon Oct 14 05:01:08 2019 -0400 @@ -17,6 +17,7 @@ --input $input --tool_dir $__tool_directory__ --out_log $log + --best_cluster $best_cluster --cluster_type ${data.clust_type} #if $data.clust_type == 'kmeans': --k_min ${data.k_min} @@ -46,7 +47,7 @@ <option value="hierarchy">Agglomerative Hierarchical</option> </param> <when value="kmeans"> - <param name="k_min" argument="--k_min" type="integer" min="2" max="99" value="3" label="Min number of clusters (k) to be tested" /> + <param name="k_min" argument="--k_min" type="integer" min="1" max="99" value="3" label="Min number of clusters (k) to be tested" /> <param name="k_max" argument="--k_max" type="integer" min="3" max="99" value="5" label="Max number of clusters (k) to be tested" /> <param name="elbow" argument="--elbow" type="boolean" value="true" label="Draw the elbow plot from k-min to k-max"/> <param name="silhouette" argument="--silhouette" type="boolean" value="true" label="Draw the Silhouette plot from k-min to k-max"/> @@ -74,7 +75,8 @@ <outputs> <data format="txt" name="log" label="${tool.name} - Log" /> - <collection name="results" type="list" label="${tool.name} - Results"> + <data format="tabular" name="best_cluster" label="${tool.name} - Best cluster" /> + <collection name="results" type="list" label="${tool.name} - Plots and results"> <discover_datasets pattern="__name_and_ext__" directory="clustering"/> </collection> </outputs>