comparison profrep_db_reducing.py @ 0:a5f1638b73be draft

Uploaded
author petr-novak
date Wed, 26 Jun 2019 08:01:42 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a5f1638b73be
1 #!/usr/bin/env python3
2
3 import argparse
4 import subprocess
5 import re
6 from tempfile import NamedTemporaryFile
7 import os
8 import configuration
9
10
def group_reads(reads_files_list, IDENTITY_TH):
    ''' Reduce the number of reads separately for each significant cluster
    based on similarities between them using the cd-hit-est tool.
    cd-hit produces reduced reads files containing only the representative reads.

    :param reads_files_list: list of per-cluster FASTA file paths; each path
        ends with "_<cluster_number>" (set by representative_reads)
    :param IDENTITY_TH: sequence identity threshold passed to cd-hit-est -c
    :return: list of reduced-reads file paths, in the same order as the input
    '''
    reads_seqs_cl_reduced_list = []
    ## Run cd-hit on each cluster separately
    for cluster_reads in reads_files_list:
        ## cluster number is encoded as the suffix of the per-cluster file name
        cl = cluster_reads.split("_")[-1]
        reads_seqs_cl_reduced = NamedTemporaryFile(
            suffix=".reduced_{}".format(cl),
            delete=False)
        ## close before cd-hit-est writes to this path (we only need the name)
        reads_seqs_cl_reduced.close()
        ## argument list instead of a shell string: safe with unusual
        ## file names and immune to shell injection
        subprocess.call(["cd-hit-est",
                         "-i", cluster_reads,
                         "-o", reads_seqs_cl_reduced.name,
                         "-c", str(IDENTITY_TH),
                         "-M", str(configuration.MEM_LIM)])
        reads_seqs_cl_reduced_list.append(reads_seqs_cl_reduced.name)
    ## Return the list of reduced reads files
    return reads_seqs_cl_reduced_list
30
31
def representative_reads(READS_ALL, CLS_FILE, CL_SIZE_TH, CLS_REDUCED,
                         IDENTITY_TH):
    ''' Group the reads based on the sequences similarities.
    Replace a group by only the one representative read preserving the quantitative info how much reads it represents
    1. Loop over the original cls file and find the significant clusters (min. number of reads)
    2. Get the reads which are in individual significant clusters
    3. Get the reads sequences for individual clusters to run cd-hit which groups the reads for each cluster
    4. After getting all significant ones (clusters sorted by size) process the outputs from cd-hit to get reads representations
    5. Create new cls file and write down significant clusters with the new reads IDs
    6. Continue reading unsignificant original cls file and copy the rest of clusters to the new cls unchanged

    :param READS_ALL: path to FASTA file of all reads sequences
    :param CLS_FILE: path to cls file, clusters sorted by size (largest first)
    :param CL_SIZE_TH: minimum number of reads for a cluster to be reduced
    :param CLS_REDUCED: output path for the modified cls file
    :param IDENTITY_TH: identity threshold forwarded to cd-hit-est
    :return: dict mapping read ID (with leading ">") to the number of reads it
        represents (0 = read was merged into a representative)
    '''
    reads_dict = {}
    ## FIX: define up front so the final return cannot raise NameError when
    ## every cluster is significant (or CLS_FILE is empty) and the cd-hit
    ## post-processing branch below never runs
    reads_repre_dict = {}
    cl = None
    line_cl_header = None
    modify_files = True
    reads_files_list = []
    cls_reduced_file = open(CLS_REDUCED, "w")
    ## parse file of all clusters from RE
    with open(CLS_FILE, "r") as cls_ori:
        for line in cls_ori:
            if line.startswith(">"):
                line_cl_header = line
                ## cluster number (raw string for the regex pattern)
                cl = re.split(r'\t| ', line)[0].rstrip().lstrip(">")
            else:
                reads_in_cl = line.rstrip().split("\t")
                ## reduce only reads in the biggest clusters:
                ## the threshold of cluster size is set as a minimum number of reads it has to contain
                if len(reads_in_cl) >= CL_SIZE_TH:
                    ## for significant cluster create a separate file to write reads sequences
                    reads_seqs_cl_orig = NamedTemporaryFile(
                        suffix="_{}".format(cl),
                        delete=False)
                    reads_files_list.append(reads_seqs_cl_orig.name)
                    ## for every read in the cluster record which cluster it
                    ## belongs to and the per-cluster sequences file
                    for read in reads_in_cl:
                        ## KEY: read_id VALUE: [cluster number, per-cluster sequences file]
                        reads_dict[read] = [cl, reads_seqs_cl_orig.name]
                    reads_seqs_cl_orig.close()
                ## first insignificant cluster (file is sorted by size):
                ## process all collected significant clusters in one batch
                elif modify_files:
                    ## get reads sequences for significant clusters from ALL reads
                    get_read_sequences(READS_ALL, reads_dict, reads_files_list)
                    ## run cd-hit to reduce the reads for significant clusters
                    reads_seqs_cl_reduced_list = group_reads(reads_files_list,
                                                             IDENTITY_TH)
                    ## for each significant cluster, process the corresponding
                    ## pair of original and reduced reads files
                    for reads_seqs_cl_orig, reads_seqs_cl_reduced in zip(
                            reads_files_list, reads_seqs_cl_reduced_list):
                        cl_reduced = reads_seqs_cl_reduced.split("_")[-1]
                        ## get reads quantitative representation dictionary for individual cluster
                        [reads_repre_dict, reads_in_cl_mod
                         ] = process_reads_groups(reads_seqs_cl_reduced,
                                                  reads_repre_dict)
                        ## write the new IDs of reduced reads to modified cls
                        modify_cls(cls_reduced_file, reads_in_cl_mod, cl_reduced)
                        os.unlink(reads_seqs_cl_orig)
                        os.unlink(reads_seqs_cl_reduced)
                    ## write the current (first insignificant) cluster unchanged
                    cls_reduced_file.write(line_cl_header)
                    cls_reduced_file.write(line)
                    modify_files = False
                ## after reducing append the rest of clusters unchanged to the modified cls file
                else:
                    cls_reduced_file.write(line_cl_header)
                    cls_reduced_file.write(line)
    cls_reduced_file.close()
    return reads_repre_dict
102
103
def modify_cls(cls_reduced_file, reads_in_cl_mod, cl):
    ''' Write one reduced cluster record to the modified cls file:
    a header line with the cluster number and read count, followed by
    the tab-separated adjusted read IDs.
    '''
    cls_reduced_file.write(">{}\t{}\n".format(cl, len(reads_in_cl_mod)))
    cls_reduced_file.write("\t".join(reads_in_cl_mod) + "\n")
110
111
def get_read_sequences(READS_ALL, reads_dict, reads_files_list):
    '''From file of ALL reads sequences take only the ones belonging to significant clusters.
    Distribute them to separate files based on the cluster they belong to.

    :param READS_ALL: path to FASTA file of all reads (header line, then one sequence line)
    :param reads_dict: read_id (without ">") -> [cluster number, per-cluster file path]
    :param reads_files_list: list of per-cluster file paths (kept for interface
        compatibility; the target path is taken from reads_dict)
    '''
    with open(READS_ALL, "r") as reads_all:
        for line in reads_all:
            if line.startswith(">"):
                read = line.rstrip().lstrip(">")
                ## membership test directly on the dict (no .keys())
                if read in reads_dict:
                    ## append the header and its sequence line to the
                    ## per-cluster file this read belongs to
                    with open(reads_dict[read][1], "a") as reads_file:
                        reads_file.write(line)
                        ## NOTE: assumes exactly one sequence line per record
                        reads_file.write(reads_all.readline())
126
127
def process_reads_groups(reads_seqs_cl_reduced, reads_repre_dict):
    ''' Process the .clstr output of cd-hit which contains groups of original reads.
    Each group starts with > character, on separate lines are listed original reads IDs, the representative one is marked by *.
    Get the number of reads in every group to preserve the quantitative information.
    Create dictionary of representative reads as keys and the amount of reads they represent as value:
        value = 0 : read is not representative and will not take place in the reduced database
        value > 0 : value indicates the number of reads it represents
    Create list of new representative reads IDs encoding the original number of reads in the group using 'reduce' tag:
        e.g. 171freduce10 (10 original reads were reduced to one 171f representative)

    :param reads_seqs_cl_reduced: path of the reduced reads file; cd-hit wrote
        the cluster report next to it as "<path>.clstr"
    :param reads_repre_dict: accumulator dict updated in place (read ID with
        leading ">" -> represented count); also returned
    :return: (reads_repre_dict, list of "<id>reduce<count>" IDs for this cluster)
    '''
    clstr_file = "{}.clstr".format(reads_seqs_cl_reduced)
    reads_in_cl_mod = []
    read_represent = ""
    with open(clstr_file, "r") as clstr:
        # NOTE(review): the for-loop iterator and clstr.readline() are
        # deliberately interleaved below — the while loop consumes member
        # lines via readline() up to (and including) the next ">Cluster"
        # header, so the for-loop resumes at the first member of the next
        # group; do not reorder these reads.
        for line in clstr:
            count_reads = 0
            while not line.startswith(">"):
                # readline() returns "" at EOF; stop the member scan then
                if not line:
                    break
                # member line format: "<n>\t<len>nt, >READ_ID... <*|at xx%>";
                # split gives ">READ_ID" (leading ">" is kept in the key)
                read = line.split(" ")[1].split(".")[0]
                count_reads += 1
                # trailing "*" marks the representative read of the group
                if line.rstrip().endswith("*"):
                    read_represent = read
                else:
                    # non-representative: mark for exclusion from reduced DB
                    reads_repre_dict[read] = 0
                line = clstr.readline()
            # after a group is scanned, record its representative; on the
            # very first iteration (header line) read_represent is "" and
            # nothing is appended
            if read_represent:
                reads_repre_dict[read_represent] = count_reads
                reads_in_cl_mod.append("{}reduce{}".format(
                    read_represent.rstrip().lstrip(">"), count_reads))
    return reads_repre_dict, reads_in_cl_mod
159
160
def reduce_reads(READS_ALL, READS_ALL_REDUCED, reads_repre_dict):
    ''' Report a new file of reads sequences based on the original file of ALL reads using the reads representation dictionary.
    Loop over the reads in the original READS_ALL file.
    There are 3 options evaluated for the read:
        - the value in the dictionary equals to zero, read is not representative -> it will not take place in the new reads DB
        - the value is greater than zero, the read is representative -> in new read DB encode the number of representing reads using 'reduce' tag (<Original_read_ID>reduce<number_represented>)
        - the read is not in the dictionary -> add it unchanged from the original ALL reads database
    '''
    with open(READS_ALL_REDUCED, "w") as reduced_db, \
            open(READS_ALL, "r") as original_db:
        for line in original_db:
            ## only header lines drive the decision; sequence lines of
            ## skipped reads simply fall through untouched
            if not line.startswith(">"):
                continue
            header = line.rstrip()
            if header in reads_repre_dict:
                represented = reads_repre_dict[header]
                ## representative read: tag it with the represented count
                ## and copy its sequence line; count 0 means the read was
                ## merged away, so nothing is written
                if represented > 0:
                    reduced_db.write("{}reduce{}\n".format(header,
                                                           represented))
                    reduced_db.write(original_db.readline())
            else:
                ## read from an insignificant cluster: copy record unchanged
                reduced_db.write(line)
                reduced_db.write(original_db.readline())
182
183
def main(args):
    ''' Entry point: resolve output paths to absolute ones, build the
    representative-reads dictionary and write the reduced reads database.
    '''
    cls_reduced = args.cls_reduced
    reads_reduced = args.reads_reduced

    ## anchor relative output paths at the current working directory
    if not os.path.isabs(cls_reduced):
        cls_reduced = os.path.join(os.getcwd(), cls_reduced)
    if not os.path.isabs(reads_reduced):
        reads_reduced = os.path.join(os.getcwd(), reads_reduced)

    reads_repre_dict = representative_reads(args.reads_all, args.cls,
                                            args.cluster_size, cls_reduced,
                                            args.identity_th)
    reduce_reads(args.reads_all, reads_reduced, reads_repre_dict)
201
202
if __name__ == '__main__':

    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--reads_all', type=str, required=True,
                        help='input file containing all reads sequences')
    parser.add_argument('-c', '--cls', type=str, required=True,
                        help='input sorted cls file containing reads assigned to clusters')
    parser.add_argument('-rr', '--reads_reduced', type=str,
                        default=configuration.READS_ALL_REDUCED,
                        help='output file containing reduced number of reads')
    parser.add_argument('-cr', '--cls_reduced', type=str,
                        default=configuration.CLS_REDUCED,
                        help='output cls file containing adjusted clusters for the reduced reads database')
    parser.add_argument('-i', '--identity_th', type=float, default=0.90,
                        help='reads identity threshold for cdhit')
    parser.add_argument('-cs', '--cluster_size', type=int, default=1000,
                        help='minimum cluster size to be included in reducing')

    main(parser.parse_args())