annotate profrep_db_reducing.py @ 5:ad3bbf392135 draft

Uploaded
author petr-novak
date Wed, 26 Jun 2019 11:14:05 -0400
parents a5f1638b73be
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
2
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
3 import argparse
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
4 import subprocess
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
5 import re
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
6 from tempfile import NamedTemporaryFile
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
7 import os
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
8 import configuration
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
9
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
10
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
def group_reads(reads_files_list, IDENTITY_TH):
    ''' Reduce the reads of every significant cluster with the cd-hit-est tool.

    For each FASTA file in reads_files_list a new temporary output file is
    reserved and cd-hit-est is run with the input file, that output path, the
    identity threshold and the memory limit taken from configuration.MEM_LIM.
    The output file then holds only the representative reads; the grouping
    itself is read later from the "<output>.clstr" file.

    Parameters:
        reads_files_list: paths to per-cluster reads files; the text after the
            last "_" in each path is reused as the cluster tag of the output
        IDENTITY_TH: sequence identity threshold passed to cd-hit-est as -c

    Returns:
        list of paths to the reduced reads files, in the order of the input

    Raises:
        subprocess.CalledProcessError: cd-hit-est finished with non-zero status
        OSError: cd-hit-est could not be started
    '''
    reads_seqs_cl_reduced_list = []
    ## Run cd-hit on each cluster separately
    for cluster_reads in reads_files_list:
        cl = cluster_reads.split("_")[-1]
        ## only the unique name is needed; the handle is closed before cd-hit
        ## writes to the path (delete=False keeps the file on disk)
        with NamedTemporaryFile(suffix=".reduced_{}".format(cl),
                                delete=False) as reads_seqs_cl_reduced:
            reduced_path = reads_seqs_cl_reduced.name
        ## an argument list (no shell) keeps paths with spaces or shell
        ## metacharacters intact; a failed run is reported instead of ignored
        ## NOTE(review): cd-hit may shorten long sequence names in the .clstr
        ## output (see its -d option) - confirm the read IDs are short enough
        subprocess.check_call([
            "cd-hit-est",
            "-i", cluster_reads,
            "-o", reduced_path,
            "-c", str(IDENTITY_TH),
            "-M", str(configuration.MEM_LIM),
        ])
        reads_seqs_cl_reduced_list.append(reduced_path)
    ## Return the list of reduced reads files
    return reads_seqs_cl_reduced_list
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
30
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
31
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
def _reduce_significant_clusters(READS_ALL, reads_dict, reads_files_list,
                                 IDENTITY_TH, cls_reduced_file):
    ''' Run the reduction for all collected significant clusters.

    Fills the per-cluster reads files, runs cd-hit on them, writes the reduced
    clusters to cls_reduced_file and removes the temporary files.
    Returns the dictionary of reads representation ({} when there is nothing
    to reduce).
    '''
    reads_repre_dict = {}
    if not reads_files_list:
        ## no significant cluster: skip the scan of READS_ALL and the cd-hit runs
        return reads_repre_dict
    ## get reads sequences for significant clusters from ALL reads
    get_read_sequences(READS_ALL, reads_dict, reads_files_list)
    ## run cd-hit to reduce the reads for significant clusters
    reads_seqs_cl_reduced_list = group_reads(reads_files_list, IDENTITY_TH)
    ## for individual significant cluster, process the corresponding file of original and reduced reads
    for reads_seqs_cl_orig, reads_seqs_cl_reduced in zip(
            reads_files_list, reads_seqs_cl_reduced_list):
        cl = reads_seqs_cl_reduced.split("_")[-1]
        ## get reads quantitative representation dictionary for individual cluster
        reads_repre_dict, reads_in_cl_mod = process_reads_groups(
            reads_seqs_cl_reduced, reads_repre_dict)
        ## for each significant cluster write the new IDs of reduced reads to modified cls
        modify_cls(cls_reduced_file, reads_in_cl_mod, cl)
        os.unlink(reads_seqs_cl_orig)
        os.unlink(reads_seqs_cl_reduced)
        ## the .clstr file read by process_reads_groups is temporary as well
        clstr_file = "{}.clstr".format(reads_seqs_cl_reduced)
        if os.path.exists(clstr_file):
            os.unlink(clstr_file)
    return reads_repre_dict


def representative_reads(READS_ALL, CLS_FILE, CL_SIZE_TH, CLS_REDUCED,
                         IDENTITY_TH):
    ''' Group the reads based on sequence similarity and write the reduced cls file.

    A group of similar reads is replaced by one representative read which keeps
    the information about how many reads it represents.
    1. Loop over the original cls file (expected to be sorted by cluster size)
       and collect the leading significant clusters (min. number of reads)
    2. Get the reads sequences of these clusters and run cd-hit on each cluster
    3. Process the outputs of cd-hit to get the reads representation
    4. Write the significant clusters with the new reads IDs to the new cls file
    5. Copy all remaining clusters to the new cls file unchanged

    Parameters:
        READS_ALL: path to the file with all reads sequences
        CLS_FILE: path to the input cls file (header line starting with ">"
            followed by a line of tab separated read IDs)
        CL_SIZE_TH: minimum number of reads of a significant cluster
        CLS_REDUCED: path to the output cls file (overwritten)
        IDENTITY_TH: identity threshold passed on to cd-hit

    Returns:
        dictionary of reads representation built by process_reads_groups;
        empty when no cluster was reduced
    '''
    reads_dict = {}
    ## defined up front so that an empty cls file does not end in UnboundLocalError
    reads_repre_dict = {}
    cl = None
    line_cl_header = None
    ## True until the reduction of the leading significant clusters was done
    modify_files = True
    reads_files_list = []
    with open(CLS_REDUCED, "w") as cls_reduced_file, \
            open(CLS_FILE, "r") as cls_ori:
        ## parse file of all clusters from RE
        for line in cls_ori:
            if line.startswith(">"):
                line_cl_header = line
                ## cluster number
                cl = re.split(r'\t| ', line)[0].rstrip().lstrip(">")
                continue
            reads_in_cl = line.rstrip().split("\t")
            ## reduce only reads in the biggest clusters:
            ## the threshold of cluster size is set as a minimum number of reads it has to contain
            if modify_files and len(reads_in_cl) >= CL_SIZE_TH:
                ## for significant cluster create a separate file to write reads sequences
                with NamedTemporaryFile(suffix="_{}".format(cl),
                                        delete=False) as reads_seqs_cl_orig:
                    reads_files_list.append(reads_seqs_cl_orig.name)
                    ## Dictionary of reads from significant clusters -> KEY:read_id VALUE:[number of cluster, filename to reads sequences file]
                    for read in reads_in_cl:
                        reads_dict[read] = [cl, reads_seqs_cl_orig.name]
                continue
            if modify_files:
                ## first cluster below the threshold: all significant clusters are known now
                reads_repre_dict = _reduce_significant_clusters(
                    READS_ALL, reads_dict, reads_files_list, IDENTITY_TH,
                    cls_reduced_file)
                modify_files = False
            ## clusters that are not reduced are copied unchanged; this includes a
            ## large cluster found after the reduction (unsorted input), which
            ## would otherwise be missing in the output
            cls_reduced_file.write(line_cl_header)
            cls_reduced_file.write(line)
        if modify_files:
            ## every cluster in the file was significant (or the file was empty),
            ## so the reduction has not been triggered inside the loop
            reads_repre_dict = _reduce_significant_clusters(
                READS_ALL, reads_dict, reads_files_list, IDENTITY_TH,
                cls_reduced_file)
    return reads_repre_dict
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
102
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
103
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
def modify_cls(cls_reduced_file, reads_in_cl_mod, cl):
    ''' Write one reduced cluster to the new cls file.

    The cluster is written as two lines: the header ">cl<TAB>number of reads"
    and the tab separated list of the new (adjusted) reads names.

    Parameters:
        cls_reduced_file: writable text file object of the new cls file
        reads_in_cl_mod: list of the adjusted reads names of the cluster
        cl: cluster identifier written to the header
    '''
    num_of_reads = len(reads_in_cl_mod)
    cls_reduced_file.write(">{}\t{}\n".format(cl, num_of_reads))
    cls_reduced_file.write("{}\n".format("\t".join(reads_in_cl_mod)))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
110
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
111
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
def get_read_sequences(READS_ALL, reads_dict, reads_files_list):
    ''' From the file of ALL reads sequences take only the ones belonging to significant clusters.

    Every selected read (header line and all its sequence lines) is appended to
    the reads file of the cluster it belongs to.

    Parameters:
        READS_ALL: path to the FASTA file with all reads
        reads_dict: KEY: read ID (header line without ">"),
            VALUE: [cluster, path to the reads file of that cluster]
        reads_files_list: not used here, kept for the callers
    '''
    current_path = None
    current_file = None
    try:
        with open(READS_ALL, "r") as reads_all:
            for line in reads_all:
                if line.startswith(">"):
                    read = line.rstrip().lstrip(">")
                    ## check if the read belongs to a significant cluster
                    entry = reads_dict.get(read)
                    target_path = None if entry is None else entry[1]
                    ## reopen only when the destination changes, so that
                    ## consecutive reads of one cluster share the open file
                    if target_path != current_path:
                        if current_file is not None:
                            current_file.close()
                            current_file = None
                        current_path = target_path
                        if target_path is not None:
                            current_file = open(target_path, "a")
                ## header and every following sequence line go to the same file,
                ## which keeps sequences wrapped over several lines complete
                if current_file is not None:
                    current_file.write(line)
    finally:
        if current_file is not None:
            current_file.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
126
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
127
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
def process_reads_groups(reads_seqs_cl_reduced, reads_repre_dict):
    ''' Process the .clstr output of cd-hit which contains groups of original reads.

    Each group starts with a line beginning with ">", the original reads are
    listed on separate lines and the representative one is marked by "*".
    The number of reads in every group is kept to preserve the quantitative
    information.

    Parameters:
        reads_seqs_cl_reduced: path to the reduced reads file; the groups are
            read from "<reads_seqs_cl_reduced>.clstr"
        reads_repre_dict: dictionary updated in place, KEY: read ID as written
            in the .clstr file (including the leading ">"), VALUE:
                0   : read is not representative and will not take place in the reduced database
                > 0 : number of reads the representative read stands for

    Returns:
        (reads_repre_dict, reads_in_cl_mod) where reads_in_cl_mod is the list of
        new representative reads IDs encoding the original number of reads in
        the group using the 'reduce' tag,
        e.g. 171freduce10 (10 original reads were reduced to one 171f representative)
    '''
    clstr_file = "{}.clstr".format(reads_seqs_cl_reduced)
    reads_in_cl_mod = []

    def record_group(representative, size):
        ## a group without a marked representative has nothing to report
        if representative:
            reads_repre_dict[representative] = size
            reads_in_cl_mod.append("{}reduce{}".format(
                representative.rstrip().lstrip(">"), size))

    read_represent = ""
    count_reads = 0
    with open(clstr_file, "r") as clstr:
        for line in clstr:
            if line.startswith(">"):
                ## header of the next group closes the previous one
                record_group(read_represent, count_reads)
                read_represent = ""
                count_reads = 0
                continue
            if not line.strip():
                continue
            ## second space separated field holds the read ID followed by "..."
            read = line.split(" ")[1]
            if read.endswith("..."):
                ## cut only the trailing dots so that IDs containing "." stay whole
                read = read[:-3]
            count_reads += 1
            if line.rstrip().endswith("*"):
                read_represent = read
            else:
                reads_repre_dict[read] = 0
    ## the last group is not followed by any header line
    record_group(read_represent, count_reads)
    return reads_repre_dict, reads_in_cl_mod
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
159
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
160
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
def reduce_reads(READS_ALL, READS_ALL_REDUCED, reads_repre_dict):
    ''' Write a new file of reads sequences based on the original file of ALL reads.

    Loop over the reads in the original READS_ALL file; the reads
    representation dictionary decides what happens with each read:
    - the value equals zero, the read is not representative -> it is left out
      of the new reads DB
    - the value is greater than zero, the read is representative -> its header
      gets the 'reduce' tag (<Original_read_ID>reduce<number_represented>)
    - the read is not in the dictionary -> it is copied unchanged

    Parameters:
        READS_ALL: path to the original FASTA file with all reads
        READS_ALL_REDUCED: path to the new reads file (overwritten)
        reads_repre_dict: KEY: header line without trailing whitespace
            (including ">"), VALUE: number of represented reads
    '''
    with open(READS_ALL_REDUCED, "w") as reads_all_red, \
            open(READS_ALL, "r") as reads_all_ori:
        ## tells whether the sequence lines of the current read are copied
        keep_sequence = False
        for line in reads_all_ori:
            if not line.startswith(">"):
                ## all sequence lines follow the decision made for their header,
                ## so sequences wrapped over several lines are not cut
                if keep_sequence:
                    reads_all_red.write(line)
                continue
            header = line.rstrip()
            if header not in reads_repre_dict:
                reads_all_red.write(line)
                keep_sequence = True
                continue
            amount_represented = reads_repre_dict[header]
            keep_sequence = amount_represented > 0
            if keep_sequence:
                reads_all_red.write("{}reduce{}\n".format(
                    header, amount_represented))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
182
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
183
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
def main(args):
    ''' Reduce the reads database according to the command line arguments.

    Reads the options cls, reads_all, cluster_size, identity_th, cls_reduced
    and reads_reduced from args, makes both output paths absolute (relative to
    the current working directory), creates the reduced cls file and then the
    reduced reads file.
    '''
    CLS_FILE = args.cls
    READS_ALL = args.reads_all
    CL_SIZE_TH = args.cluster_size
    IDENTITY_TH = args.identity_th
    CLS_REDUCED = args.cls_reduced
    READS_ALL_REDUCED = args.reads_reduced

    if not os.path.isabs(CLS_REDUCED):
        CLS_REDUCED = os.path.join(os.getcwd(), CLS_REDUCED)

    if not os.path.isabs(READS_ALL_REDUCED):
        READS_ALL_REDUCED = os.path.join(os.getcwd(), READS_ALL_REDUCED)

    ## the cls file is reduced first, it provides the reads representation
    ## needed to write the reduced reads file
    reads_repre_dict = representative_reads(READS_ALL, CLS_FILE, CL_SIZE_TH,
                                            CLS_REDUCED, IDENTITY_TH)
    reduce_reads(READS_ALL, READS_ALL_REDUCED, reads_repre_dict)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
201
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
202
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
if __name__ == '__main__':

    # Command line arguments; defaults of both output files come from the
    # configuration module
    parser = argparse.ArgumentParser()
    parser.add_argument('-r',
                        '--reads_all',
                        type=str,
                        required=True,
                        help='input file containing all reads sequences')
    parser.add_argument(
        '-c',
        '--cls',
        type=str,
        required=True,
        help='input sorted cls file containing reads assigned to clusters')
    parser.add_argument('-rr',
                        '--reads_reduced',
                        type=str,
                        default=configuration.READS_ALL_REDUCED,
                        help='output file containing reduced number of reads')
    parser.add_argument(
        '-cr',
        '--cls_reduced',
        type=str,
        default=configuration.CLS_REDUCED,
        help=
        'output cls file containing adjusted clusters for the reduced reads database')
    parser.add_argument('-i',
                        '--identity_th',
                        type=float,
                        default=0.90,
                        help='reads identity threshold for cdhit')
    parser.add_argument('-cs',
                        '--cluster_size',
                        type=int,
                        default=1000,
                        help='minimum cluster size to be included in reducing')

    args = parser.parse_args()
    main(args)