Mercurial > repos > rnateam > graphclust_postprocessing
comparison evaluation.py @ 16:79df97a1bc0f draft
planemo upload for repository https://github.com/eteriSokhoyan/galaxytools/tree/branchForIterations/tools/GraphClust/CollectResults commit b8f82a8101d9eb74c8dbac51b8a0c75585a888a2
author | rnateam |
---|---|
date | Fri, 23 Feb 2018 10:46:41 -0500 |
parents | b5f49453af8c |
children | f93c868203cc |
comparison
Legend: equal, deleted, inserted, replaced
15:c7ca5d173482 | 16:79df97a1bc0f |
---|---|
2 import glob | 2 import glob |
3 from os import system | 3 from os import system |
4 import re | 4 import re |
5 from sklearn import metrics | 5 from sklearn import metrics |
6 from shutil import make_archive | 6 from shutil import make_archive |
7 import sys | |
8 import fnmatch, os | |
7 | 9 |
8 def sh(script): | 10 def sh(script): |
9 system("bash -c '%s'" % script) | 11 system("bash -c '%s'" % script) |
10 | 12 |
11 dataNames = "FASTA/data.names" | 13 fasta_dir = sys.argv[1] |
14 results_dir = sys.argv[2] | |
15 dataNames = fasta_dir+"/data.names" | |
12 | 16 |
13 listOfClusters = [] | 17 listOfClusters = [] |
14 listOfHeaders = [] | 18 listOfHeaders = [] |
15 headersNames = set() | 19 headersNames = set() |
16 cluster_seqs_stats_path = "RESULTS/*.cluster.all" | 20 idsNames = set() |
17 cluster_seqs_stats_files = glob.glob(cluster_seqs_stats_path) | |
18 | 21 |
22 | |
23 names = os.listdir(results_dir) | |
24 cluster_seqs_stats_files = fnmatch.filter(names, '*.cluster.all') | |
19 with open(dataNames, "r") as names: | 25 with open(dataNames, "r") as names: |
20 for line2 in names: | 26 for line2 in names: |
21 splits2 = line2.split() | 27 splits2 = line2.split() |
22 fullHeader = '' | 28 fullHeader = '' |
23 if len(splits2) >= 6: | 29 if len(splits2) >= 6: |
24 fullHeader = splits2[5] | 30 fullHeader = splits2[5] |
25 headersNames.add(fullHeader) | 31 headersNames.add(fullHeader) |
32 fullID = splits2[3] | |
33 idsNames.add(fullID) | |
26 | 34 |
27 blackList = [] | 35 blackList = [] |
28 numberOfClusters = 0 | 36 numberOfClusters = 0 |
29 for singleFile in sorted(cluster_seqs_stats_files): | 37 for singleFile in sorted(cluster_seqs_stats_files): |
38 singleFile = os.path.join(results_dir,singleFile) | |
30 numberOfClusters += 1 | 39 numberOfClusters += 1 |
31 with open(singleFile, "r") as f: | 40 with open(singleFile, "r") as f: |
32 for line in f: | 41 for line in f: |
33 splits = line.split() | 42 splits = line.split() |
34 header = '' | 43 header = '' |
44 idd = '' | |
35 if len(splits) >= 11: | 45 if len(splits) >= 11: |
36 header = splits[10] | 46 header = splits[10] |
47 idd = splits[8] | |
37 clustNum = splits[2] | 48 clustNum = splits[2] |
38 listOfHeaders.append(header) | 49 listOfHeaders.append(header) |
39 listOfClusters.append(clustNum) | 50 listOfClusters.append(clustNum) |
40 if header in headersNames: | 51 if idd in idsNames: #header in headersNames: |
41 blackList.append(header) | 52 blackList.append(idd) |
42 | 53 |
43 numberOfClusters += 1 # 1 cluster for all unassigned seqs | 54 numberOfClusters += 1 # 1 cluster for all unassigned seqs |
55 ignoreBlackList = False | |
44 with open(dataNames, "r") as names: | 56 with open(dataNames, "r") as names: |
45 for line in names.readlines(): | 57 for line in names.readlines(): |
46 splits = line.split() | 58 splits = line.split() |
47 fullUniqeId = splits[3] | 59 fullUniqeId = splits[3] |
48 fullHeader = '' | 60 fullHeader = '' |
61 fullID = '' | |
49 if len(splits) >= 6: | 62 if len(splits) >= 6: |
50 fullHeader = line.split()[5] | 63 fullHeader = line.split()[5] |
51 if fullHeader not in blackList or len(fullHeader) == 0: | 64 fullID = line.split()[3] |
65 if ignoreBlackList or ( fullID not in blackList #fullHeader not in blackList | |
66 or len(fullHeader) == 0): | |
52 listOfHeaders.append(fullHeader) | 67 listOfHeaders.append(fullHeader) |
53 listOfClusters.append(str(numberOfClusters)) | 68 listOfClusters.append(str(numberOfClusters)) |
54 numberOfClusters += 1 # separate cluster for all unassigned seqs | 69 numberOfClusters += 1 # separate cluster for all unassigned seqs |
70 # else: | |
71 # print ("Skip header", fullHeader) | |
55 | 72 |
56 toWrite = "" | 73 toWrite = "" |
57 for i in range(len(listOfClusters)): | 74 for i in range(len(listOfClusters)): |
58 toWrite += listOfHeaders[i] + "\t" + listOfClusters[i] + '\n' | 75 toWrite += listOfHeaders[i] + "\t" + listOfClusters[i] + '\n' |
59 with open("RESULTS/fullTab.tabular", "w") as full: | 76 |
77 with open(results_dir+"/fullTab.tabular", "w") as full: | |
60 full.write(toWrite) | 78 full.write(toWrite) |
61 | 79 |
62 | 80 |
63 pattern = re.compile("^RF.*$") | 81 pattern = re.compile("^RF.*$") |
64 | 82 |
70 adjusted_mutual_info_score = metrics.adjusted_mutual_info_score(listOfHeaders, listOfClusters) | 88 adjusted_mutual_info_score = metrics.adjusted_mutual_info_score(listOfHeaders, listOfClusters) |
71 v_measure_score = metrics.v_measure_score(listOfHeaders, listOfClusters) | 89 v_measure_score = metrics.v_measure_score(listOfHeaders, listOfClusters) |
72 | 90 |
73 toWrite = "completeness_score : " + str(completeness_score) + "\n" + "homogeneity_score : " + str(homogeneity_score) + "\n" + "adjusted_rand_score : " +str(adjusted_rand_score) + "\n" + "adjusted_mutual_info_score : " + str(adjusted_mutual_info_score)+ "\n" + "v_measure_score : " + str(v_measure_score) | 91 toWrite = "completeness_score : " + str(completeness_score) + "\n" + "homogeneity_score : " + str(homogeneity_score) + "\n" + "adjusted_rand_score : " +str(adjusted_rand_score) + "\n" + "adjusted_mutual_info_score : " + str(adjusted_mutual_info_score)+ "\n" + "v_measure_score : " + str(v_measure_score) |
74 | 92 |
93 | |
75 else: | 94 else: |
76 toWrite = "completeness_score : NA \nhomogeneity_score : NA \nadjusted_rand_score : NA \nadjusted_mutual_info_score : NA \nv_measure_score : NA" | 95 toWrite = "completeness_score : NA \nhomogeneity_score : NA \nadjusted_rand_score : NA \nadjusted_mutual_info_score : NA \nv_measure_score : NA" |
77 | 96 |
78 with open("RESULTS/evaluation.txt", "w") as fOut: | 97 with open(os.path.join(results_dir,"evaluation.txt"), "w") as fOut: |
79 fOut.write(toWrite) | 98 fOut.write(toWrite) |
80 | 99 |
81 | 100 |
82 make_archive('RESULTS', 'zip', root_dir='RESULTS') | 101 make_archive('RESULTS', 'zip', root_dir=results_dir) |