comparison addCdhitseqs.py @ 6:869a6e807d76 draft

planemo upload for repository https://github.com/eteriSokhoyan/galaxytools/tree/branchForIterations/tools/GraphClust/CollectResults commit 057c2fd398055dc86eb2c00d8a74f301d5c231d9-dirty
author rnateam
date Wed, 22 Feb 2017 16:51:06 -0500
parents 79b9117aef01
children
comparison
equal deleted inserted replaced
5:4310ac018d05 6:869a6e807d76
1 import re 1 import re
2 import glob 2 import glob
3 import sys 3 import sys
4 4
5 cdhitcluster = sys.argv[1] 5 cdhitcluster = sys.argv[1]
6 #clusters = sys.argv[2]
7 6
8 cluster_seqs_stats_path = "RESULTS/*.cluster.all" 7 cluster_seqs_stats_path = "RESULTS/*.cluster.all"
9 cluster_seqs_stats_files = glob.glob(cluster_seqs_stats_path) 8 cluster_seqs_stats_files = glob.glob(cluster_seqs_stats_path)
10 9
11 #clusterFiles = clusters.split(',')
12 repSeqRedSeqdict = {} 10 repSeqRedSeqdict = {}
13 repLine = "" 11 repLine = ""
14 count = 0 12 count = 0
15 first = False 13 first = False
14 add_FullId = ""
15 k = 0
16 16
17 with open(cdhitcluster, 'r+') as f: 17 with open(cdhitcluster, 'r+') as f:
18 lines = f.readlines() 18 content = f.read()
19 reps = re.compile("^.*\*$", re.MULTILINE).findall(content)
20 lines = content.split('\n')
21
19 for i in range(0, len(lines)): 22 for i in range(0, len(lines)):
20 line = lines[i] 23 line = lines[i]
21 if ">Cluster" in line: 24 if ">Cluster" in line:
22 first = True 25 first = True
23 count = 0 26 count = 0
24 if i+1 < len(lines): 27 repLine = reps[k]
25 repLine = lines[i+1] 28 k = k+1
26 continue 29 continue
27 elif not first: 30 elif not first:
28 count += 1 31 count += 1
29 first = False 32 first = False
30 else: 33 else:
31 first = False 34 first = False
32 lineArr = [] 35 lineArr = []
33 if count > 0: 36 if count > 0:
34 repLine = repLine.strip() 37 repLine = repLine.strip()
35 rep_FullId = repLine.split()[2] 38 rep_FullId = repLine.split()[2]
36 rep_FullId = rep_FullId.replace(">", "") 39 rep_FullId = rep_FullId.replace(">","")
37 #rep_short_id = re.findall("_".join(["[^_]+"] * 2), rep_FullId)[0] 40 rep_FullId = rep_FullId.replace("...","")
38 rep_FullId = rep_FullId.replace("...", "") 41 if "*" in line or not line.strip():
42 continue
39 line = line.strip() 43 line = line.strip()
40 add_FullId = line.split()[2] 44 add_FullId = line.split()[2]
41 add_FullId = add_FullId.replace(">", "") 45 add_FullId = add_FullId.replace(">","")
42 add_FullId = add_FullId.replace("...", "") 46 add_FullId = add_FullId.replace("...","")
43 #add_short_id = re.findall("_".join(["[^_]+"] * 2), add_FullId)[0]
44 lineArr.append(add_FullId) 47 lineArr.append(add_FullId)
45 repSeqRedSeqdict[rep_FullId] = lineArr 48 repSeqRedSeqdict[rep_FullId] = lineArr
46 #lineArr.append(add_short_id)
47 #repSeqRedSeqdict[rep_short_id] = lineArr
48 49
49 toWrite = "" 50 toWrite = ""
50
51 for singleFile in sorted(cluster_seqs_stats_files): 51 for singleFile in sorted(cluster_seqs_stats_files):
52 with open(singleFile, "a+") as clFile: 52 toWrite = ""
53 file_content = clFile.read() 53 with open(singleFile, "r+") as clFile:
54 first_line = file_content.split('\n')[0] 54 file_lines = clFile.readlines()
55 for line in file_lines:
56 line = '\t'.join(line.split())
57 toWrite += line + '\n'
58 clFile.seek(0)
59 clFile.write(toWrite)
60 clFile.truncate()
61 first_line = file_lines[0]
62 toWrite = ""
63 cols = first_line.split()
64 file_content = '\n'.join(file_lines)
55 for key, val in repSeqRedSeqdict.items(): 65 for key, val in repSeqRedSeqdict.items():
56 if key in file_content: 66 if key in file_content:
67
57 for i in val: 68 for i in val:
58 toWrite += first_line.split()[0] + " " + first_line.split()[1] + " " + first_line.split()[2] + " " + " - " + " " + "CD-Hit" + " " + first_line.split()[5] + " " + "ORIGID" + " " + str(i) + "\n" 69 cols[3] = "---"
70 cols[4] = "CD-Hit"
71 cols[7] = str(i)
72 if len(first_line.split()) > 9:
73 cols[9] = str(i.rsplit("_",1)[0])
74 toWrite += '\t'.join(cols)
75 toWrite +="\n"
59 clFile.write(toWrite) 76 clFile.write(toWrite)