Mercurial > repos > rnateam > graphclust_postprocessing
comparison addCdhitseqs.py @ 3:79b9117aef01 draft
planemo upload for repository https://github.com/eteriSokhoyan/galaxytools/tree/branchForIterations/tools/GraphClust/CollectResults commit c03cf64554289eb098267c0923cf0cf7b245cc0c
| author | rnateam |
|---|---|
| date | Wed, 04 Jan 2017 18:15:07 -0500 |
| parents | |
| children | 869a6e807d76 |
comparison
equal
deleted
inserted
replaced
| 2:b8e32e577597 | 3:79b9117aef01 |
|---|---|
"""Append CD-Hit redundant sequences to the cluster result files.

Usage: addCdhitseqs.py <cluster listing>

The cluster listing (first command-line argument) is scanned for lines
containing ">Cluster"; the first member line after such a header is taken
as the cluster representative and every following member line as a
sequence that was collapsed into it.  For every file matching
``RESULTS/*.cluster.all`` that mentions a representative, one extra line
per collapsed sequence is appended, built from fields of that file's
first line.
"""
import re
import glob
import sys

cluster_seqs_stats_path = "RESULTS/*.cluster.all"


def clean_seq_id(member_line):
    """Return the sequence id stored in the third field of *member_line*.

    The leading ">" and the trailing "..." decoration are removed.  Returns
    None when the line has fewer than three whitespace-separated fields
    (e.g. a blank line), so callers can skip it instead of crashing.
    """
    fields = member_line.split()
    if len(fields) < 3:
        return None
    return fields[2].replace(">", "").replace("...", "")


def parse_cdhit_clusters(lines):
    """Map each cluster representative id to the ids collapsed into it.

    *lines* is any iterable of text lines from the cluster listing.
    Clusters that contain only their representative produce no entry.

    NOTE(review): the first member listed after a ">Cluster" header is
    treated as the representative; presumably that is always the entry
    CD-HIT flags with "*" - confirm against the CD-HIT output in use.
    """
    rep_to_redundant = {}
    rep_id = None
    awaiting_rep = False
    for line in lines:
        if ">Cluster" in line:
            # New cluster: the next well-formed member line is its representative.
            awaiting_rep = True
            rep_id = None
            continue
        seq_id = clean_seq_id(line)
        if seq_id is None:
            continue
        if awaiting_rep:
            rep_id = seq_id
            awaiting_rep = False
        elif rep_id is not None:
            # Accumulate every member of the cluster, not only the last one.
            rep_to_redundant.setdefault(rep_id, []).append(seq_id)
    return rep_to_redundant


def append_cdhit_members(cluster_file, rep_to_redundant):
    """Append one line per collapsed sequence to *cluster_file*.

    A representative is considered part of the file when its id occurs
    anywhere in the file's text.
    NOTE(review): this is a substring test, so an id that is a prefix of
    another id also matches - verify ids cannot overlap like that.

    Each appended line reuses fields 0, 1, 2 and 5 of the file's first
    line.  Returns the number of sequence lines appended; the file is not
    opened for writing when there is nothing to add.

    Raises IndexError if a representative matches but the first line has
    fewer than six fields.
    """
    # Read with a plain "r" handle: an "a+" handle starts at end-of-file on
    # Python 3, which made the original read() return an empty string.
    with open(cluster_file, "r") as handle:
        file_content = handle.read()
    first_fields = file_content.split("\n")[0].split()

    # Collected per file, so lines never leak from one result file to the next.
    new_lines = []
    for rep_id, redundant_ids in rep_to_redundant.items():
        if rep_id not in file_content:
            continue
        for seq_id in redundant_ids:
            new_lines.append(
                "{} {} {}  -  CD-Hit {} ORIGID {}\n".format(
                    first_fields[0],
                    first_fields[1],
                    first_fields[2],
                    first_fields[5],
                    seq_id,
                )
            )
    if not new_lines:
        return 0

    with open(cluster_file, "a") as handle:
        if not file_content.endswith("\n"):
            # Keep the first appended record on its own line.
            handle.write("\n")
        handle.writelines(new_lines)
    return len(new_lines)


def main(argv=None):
    """Run the script; *argv* defaults to ``sys.argv``."""
    argv = sys.argv if argv is None else argv
    cdhitcluster = argv[1]
    # The listing is only read, so no write access is requested.
    with open(cdhitcluster, "r") as f:
        rep_to_redundant = parse_cdhit_clusters(f)
    for single_file in sorted(glob.glob(cluster_seqs_stats_path)):
        append_cdhit_members(single_file, rep_to_redundant)


if __name__ == "__main__":
    main()
