0
|
1 #!/usr/bin/env python3
|
|
2
|
|
3 import argparse
|
|
4 import subprocess
|
|
5 import re
|
|
6 from tempfile import NamedTemporaryFile
|
|
7 import os
|
|
8 import configuration
|
|
9
|
|
10
|
|
def group_reads(reads_files_list, IDENTITY_TH):
    ''' Reduce the number of reads separately for each significant cluster
    based on sequence similarity, using the external cd-hit-est tool.

    cd-hit-est writes, for every input reads file, a reduced FASTA file
    containing only the representative reads (plus a companion ".clstr"
    grouping file that is consumed later by process_reads_groups).

    :param reads_files_list: paths of per-cluster reads FASTA files; each
        filename ends with "_<cluster_number>"
    :param IDENTITY_TH: sequence identity threshold passed to cd-hit-est -c
    :return: list of paths of the reduced reads files, one per input file,
        each ending with ".reduced_<cluster_number>"
    '''
    reads_seqs_cl_reduced_list = []
    ## Run cd-hit-est on each cluster separately
    for cluster_reads in reads_files_list:
        ## the cluster number is encoded as the suffix of the input file name
        cl = cluster_reads.split("_")[-1]
        reads_seqs_cl_reduced = NamedTemporaryFile(
            suffix=".reduced_{}".format(cl),
            delete=False)
        ## close our handle before cd-hit-est writes to the path
        reads_seqs_cl_reduced.close()
        ## argument list instead of a shell-formatted string: temp-file paths
        ## containing shell metacharacters cannot break the command and there
        ## is no shell-injection surface (shell=False is the default)
        subprocess.call([
            "cd-hit-est",
            "-i", cluster_reads,
            "-o", reads_seqs_cl_reduced.name,
            "-c", str(IDENTITY_TH),
            "-M", str(configuration.MEM_LIM),
        ])
        reads_seqs_cl_reduced_list.append(reads_seqs_cl_reduced.name)
    ## Return the list of reduced reads files
    return reads_seqs_cl_reduced_list
|
|
30
|
|
31
|
|
def representative_reads(READS_ALL, CLS_FILE, CL_SIZE_TH, CLS_REDUCED,
                         IDENTITY_TH):
    ''' Group the reads based on sequence similarity and replace each group
    by a single representative read, preserving the quantitative information
    of how many reads it represents.

    Steps:
    1. Loop over the original cls file and find the significant clusters
       (at least CL_SIZE_TH reads).
    2. Collect which reads belong to each significant cluster and extract
       their sequences into per-cluster files (get_read_sequences).
    3. Run cd-hit-est per cluster (group_reads) and turn its .clstr output
       into a read-representation dictionary (process_reads_groups).
    4. Write the significant clusters with the new read IDs to the new cls
       file (modify_cls).
    5. Copy the remaining (insignificant) clusters to the new cls file
       unchanged.

    NOTE(review): the algorithm assumes CLS_FILE is sorted by cluster size
    in descending order — the first cluster below the threshold triggers the
    whole reduction step; if every cluster is significant, the reduction is
    never performed. Confirm upstream guarantees this ordering.

    :param READS_ALL: path to the FASTA file of all read sequences
    :param CLS_FILE: path to the size-sorted cls file of clusters
    :param CL_SIZE_TH: minimum number of reads for a cluster to be reduced
    :param CLS_REDUCED: path of the modified cls file to write
    :param IDENTITY_TH: identity threshold forwarded to cd-hit-est
    :return: dict KEY:read header VALUE:number of reads it represents
        (0 means the read was grouped away and is not representative)
    '''
    reads_dict = {}
    ## initialized up front so the final return cannot raise a NameError
    ## when CLS_FILE is empty or the reduction step is never reached
    reads_repre_dict = {}
    cl = None
    line_cl_header = None
    modify_files = True
    reads_files_list = []
    ## "with" guarantees the output cls file is closed even on error
    with open(CLS_REDUCED, "w") as cls_reduced_file:
        ## parse file of all clusters from RE
        with open(CLS_FILE, "r") as cls_ori:
            for line in cls_ori:
                if line.startswith(">"):
                    line_cl_header = line
                    ## cluster number
                    cl = re.split('\t| ', line)[0].rstrip().lstrip(">")
                else:
                    reads_in_cl = line.rstrip().split("\t")
                    ## reduce only reads in the biggest clusters:
                    ## the threshold is a minimum number of reads per cluster
                    if len(reads_in_cl) >= CL_SIZE_TH:
                        ## separate temp file per significant cluster for
                        ## its read sequences
                        reads_seqs_cl_orig = NamedTemporaryFile(
                            suffix="_{}".format(cl),
                            delete=False)
                        reads_files_list.append(reads_seqs_cl_orig.name)
                        ## record, for every read, its cluster and the file
                        ## its sequence will be written to
                        for read in reads_in_cl:
                            ## KEY:read_id VALUE:[cluster number, path of
                            ## the cluster's reads sequences file]
                            reads_dict[read] = [cl, reads_seqs_cl_orig.name]
                        reads_seqs_cl_orig.close()
                    ## first cluster below the threshold: all significant
                    ## clusters are known (file sorted by size), so run the
                    ## reduction and write the modified cls entries
                    elif modify_files:
                        ## extract sequences of significant clusters' reads
                        get_read_sequences(READS_ALL, reads_dict,
                                           reads_files_list)
                        ## run cd-hit-est to reduce each significant cluster
                        reads_seqs_cl_reduced_list = group_reads(
                            reads_files_list, IDENTITY_TH)
                        ## process the per-cluster original/reduced file pairs
                        for reads_seqs_cl_orig, reads_seqs_cl_reduced in zip(
                                reads_files_list, reads_seqs_cl_reduced_list):
                            ## cluster number is encoded in the file suffix;
                            ## a separate name avoids shadowing "cl" above
                            cl_reduced = reads_seqs_cl_reduced.split("_")[-1]
                            ## quantitative read representation per cluster
                            [reads_repre_dict, reads_in_cl_mod
                             ] = process_reads_groups(reads_seqs_cl_reduced,
                                                      reads_repre_dict)
                            ## write the reduced read IDs to the new cls file
                            modify_cls(cls_reduced_file, reads_in_cl_mod,
                                       cl_reduced)
                            os.unlink(reads_seqs_cl_orig)
                            os.unlink(reads_seqs_cl_reduced)
                        ## write the cluster that was checked but not reduced
                        cls_reduced_file.write(line_cl_header)
                        cls_reduced_file.write(line)
                        modify_files = False
                    ## copy the rest of the clusters unchanged
                    else:
                        cls_reduced_file.write(line_cl_header)
                        cls_reduced_file.write(line)
    return reads_repre_dict
|
|
102
|
|
103
|
|
def modify_cls(cls_reduced_file, reads_in_cl_mod, cl):
    ''' Write one significant cluster with its adjusted read names.

    Emits a ">cluster\\tcount" header line followed by the tab-joined
    list of new read IDs.
    '''
    header = ">{}\t{}\n".format(cl, len(reads_in_cl_mod))
    cls_reduced_file.write(header)
    cls_reduced_file.write("\t".join(reads_in_cl_mod) + "\n")
|
|
110
|
|
111
|
|
def get_read_sequences(READS_ALL, reads_dict, reads_files_list):
    '''From the file of ALL read sequences take only the reads belonging to
    significant clusters and distribute them to the per-cluster files.

    Assumes 2-line FASTA records: every ">" header line is immediately
    followed by exactly one sequence line.

    :param READS_ALL: path to the FASTA file of all read sequences
    :param reads_dict: KEY:read id (without ">") VALUE:[cluster number,
        path of the cluster's reads file]
    :param reads_files_list: per-cluster file paths (kept for interface
        compatibility; the target path is taken from reads_dict)
    '''
    with open(READS_ALL, "r") as reads_all:
        for line in reads_all:
            if line.startswith(">"):
                read = line.rstrip().lstrip(">")
                ## single dict lookup instead of "in reads_dict.keys()"
                cluster_info = reads_dict.get(read)
                if cluster_info is not None:
                    ## append header + sequence to the cluster's reads file
                    ## NOTE(review): the file is re-opened for every read;
                    ## fine for moderate sizes, consider caching handles if
                    ## this shows up in profiling
                    with open(cluster_info[1], "a") as reads_file:
                        reads_file.write(line)
                        reads_file.write(reads_all.readline())
|
|
126
|
|
127
|
|
def process_reads_groups(reads_seqs_cl_reduced, reads_repre_dict):
    ''' Process the .clstr output of cd-hit, which contains groups of
    original reads.

    Each group starts with a ">" header line; the following lines list the
    original read IDs, with the representative read marked by "*".
    The number of reads in every group is recorded to preserve the
    quantitative information.

    Updates the dictionary of representative reads (KEY:read header,
    VALUE:amount represented):
        value = 0 : read is not representative and is dropped from the
                    reduced database
        value > 0 : the number of reads this representative stands for
    Also builds the list of new read IDs that encode the group size with a
    'reduce' tag, e.g. 171freduce10 (10 original reads reduced to the one
    representative 171f).

    Rewritten as a single flush-on-header pass: the old version mixed
    for-iteration with readline() and never reset the representative
    between groups, so a malformed group without a "*" line would silently
    reuse the previous group's representative. Output is unchanged for
    well-formed cd-hit .clstr files.

    :param reads_seqs_cl_reduced: path of the reduced reads file; the
        cd-hit grouping file is at "<path>.clstr"
    :param reads_repre_dict: dictionary to update in place
    :return: (reads_repre_dict, list of 'reduce'-tagged read IDs)
    '''
    clstr_file = "{}.clstr".format(reads_seqs_cl_reduced)
    reads_in_cl_mod = []
    read_represent = ""
    count_reads = 0
    with open(clstr_file, "r") as clstr:
        for line in clstr:
            if line.startswith(">"):
                ## group header: flush the previous group, if any
                if read_represent:
                    reads_repre_dict[read_represent] = count_reads
                    reads_in_cl_mod.append("{}reduce{}".format(
                        read_represent.rstrip().lstrip(">"), count_reads))
                ## reset per-group state (fix: was carried over before)
                read_represent = ""
                count_reads = 0
            else:
                ## member line, e.g. "0\t147nt, >readid... *"
                read = line.split(" ")[1].split(".")[0]
                count_reads += 1
                if line.rstrip().endswith("*"):
                    read_represent = read
                else:
                    reads_repre_dict[read] = 0
    ## flush the final group (file does not end with a header)
    if read_represent:
        reads_repre_dict[read_represent] = count_reads
        reads_in_cl_mod.append("{}reduce{}".format(
            read_represent.rstrip().lstrip(">"), count_reads))
    return reads_repre_dict, reads_in_cl_mod
|
|
159
|
|
160
|
|
def reduce_reads(READS_ALL, READS_ALL_REDUCED, reads_repre_dict):
    ''' Write a new reads file based on the original file of ALL reads,
    using the reads representation dictionary.

    For every header in READS_ALL (2-line FASTA records assumed) there are
    three cases:
    - dictionary value is 0: the read is not representative and is dropped
      from the reduced database
    - dictionary value > 0: the read is representative; its new header
      carries the 'reduce' tag (<original_header>reduce<amount>)
    - header not in the dictionary: the record is copied unchanged
    '''
    with open(READS_ALL_REDUCED, "w") as out_handle:
        with open(READS_ALL, "r") as in_handle:
            for record_line in in_handle:
                ## sequence lines of kept reads are consumed by readline()
                ## below; any sequence line reaching this loop belongs to a
                ## dropped read and is skipped
                if not record_line.startswith(">"):
                    continue
                header = record_line.rstrip()
                if header in reads_repre_dict:
                    represented = reads_repre_dict[header]
                    if represented > 0:
                        ## representative read: tag header, keep sequence
                        out_handle.write("{}reduce{}\n".format(
                            header, represented))
                        out_handle.write(in_handle.readline())
                else:
                    ## read from an insignificant cluster: copy unchanged
                    out_handle.write(record_line)
                    out_handle.write(in_handle.readline())
|
|
182
|
|
183
|
|
def main(args):
    ''' Drive the read-reduction pipeline from parsed CLI arguments.

    Relative output paths are anchored to the current working directory,
    then the representative-read dictionary is built and the reduced reads
    database is written.
    '''
    cwd = os.getcwd()

    cls_reduced = args.cls_reduced
    if not os.path.isabs(cls_reduced):
        cls_reduced = os.path.join(cwd, cls_reduced)

    reads_reduced = args.reads_reduced
    if not os.path.isabs(reads_reduced):
        reads_reduced = os.path.join(cwd, reads_reduced)

    reads_repre_dict = representative_reads(args.reads_all, args.cls,
                                            args.cluster_size, cls_reduced,
                                            args.identity_th)
    reduce_reads(args.reads_all, reads_reduced, reads_repre_dict)
|
|
201
|
|
202
|
|
if __name__ == '__main__':

    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--reads_all', type=str, required=True,
                        help='input file containing all reads sequences')
    parser.add_argument('-c', '--cls', type=str, required=True,
                        help='input sorted cls file containing reads assigned to clusters')
    parser.add_argument('-rr', '--reads_reduced', type=str,
                        default=configuration.READS_ALL_REDUCED,
                        help='output file containing reduced number of reads')
    parser.add_argument('-cr', '--cls_reduced', type=str,
                        default=configuration.CLS_REDUCED,
                        help='output cls file containing adjusted clusters for the reduced reads database')
    parser.add_argument('-i', '--identity_th', type=float, default=0.90,
                        help='reads identity threshold for cdhit')
    parser.add_argument('-cs', '--cluster_size', type=int, default=1000,
                        help='minimum cluster size to be included in reducing')

    main(parser.parse_args())
|