comparison pal_filter.py @ 3:e1a14ed7a9d6 draft

Updated to version 0.02.04.4 (new pal_filter script)
author pjbriggs
date Wed, 24 Feb 2016 08:25:17 -0500
parents
children cb56cc1d5c39
comparison
equal deleted inserted replaced
2:b6ccc7dd7b02 3:e1a14ed7a9d6
1 #!/usr/bin/python -tt
2 #
3 # pal_filter
4 # https://github.com/graemefox/pal_filter
5 #
6 ################################################################################
7 # Graeme Fox - 15/02/2016 - graeme.fox@manchester.ac.uk
8 # Tested on 64-bit Ubuntu, with Python 2.7
9 #
10 ################################################################################
11 # PROGRAM DESCRIPTION
12 #
13 # Program to pick optimum loci from the output of pal_finder_v0.02.04
14 #
15 # This program can be used to filter output from pal_finder and choose the
16 # 'optimum' loci.
17 #
18 # For the paper referncing this workflow, see Griffiths et al.
19 # (unpublished as of 15/02/2016) (sarah.griffiths-5@postgrad.manchester.ac.uk)
20 #
21 ################################################################################
22 #
23 # This program also contains a quality-check method to improve the rate of PCR
24 # success. For this QC method, paired end reads are assembled using
25 # PANDAseq so you must have PANDAseq installed.
26 #
27 # For the paper referencing this assembly-QC method see Fox et al.
28 # (unpublished as of 15/02/2016) (graeme.fox@manchester.ac.uk)
29 #
30 # For best results in PCR for marker development, I suggest enabling all the
31 # filter options AND the assembly based QC
32 #
33 ################################################################################
34 # REQUIREMENTS
35 #
36 # Must have Biopython installed (www.biopython.org).
37 #
38 # If you with to perform the assembly QC step, you must have:
39 # PandaSeq (https://github.com/neufeld/pandaseq)
40 # PandaSeq must be in your $PATH / able to run from anywhere
41 ################################################################################
42 # REQUIRED OPTIONS
43 #
44 # -i forward_paired_ends.fastQ
45 # -j reverse_paired_ends.fastQ
46 # -p pal_finder output - the "(microsatellites with read IDs and
47 # primer pairs)" file
48 #
49 # By default the program does nothing....
50 #
51 # NON-REQUIRED OPTIONS
52 #
53 # -assembly: turn on the pandaseq assembly QC step
54 # -primers: filter microsatellite loci to just those which
55 # have primers designed
56 #
57 # -occurrences: filter microsatellite loci to those with primers
58 # which appear only once in the dataset
59 #
60 # -rankmotifs: filter microsatellite loci to just those with perfect motifs.
61 # Rank the output by size of motif (largest first)
62 #
63 ###########################################################
64 # For repeat analysis, the following extra non-required options may be useful:
65 #
66 # Since PandaSeq Assembly, and fastq -> fasta conversion are slow, do them the
67 # first time, generate the files and then skip either, or both steps with
68 # the following:
69 #
70 # -a: skip assembly step
71 # -c: skip fastq -> fasta conversion step
72 #
73 # Just make sure to keep the assembled/converted files in the correct directory
74 # with the correct filename(s)
75 ###########################################################
76 #
77 # EXAMPLE USAGE:
78 #
79 # pal_filtery.py -i R1.fastq -j R2.fastq
80 # -p pal_finder_output.tabular -primers -occurrences -rankmotifs -assembly
81 #
82 ###########################################################
83 import Bio, subprocess, argparse, csv, os, re, time
84 from Bio import SeqIO
85
86 # Get values for all the required and optional arguments
87
88 parser = argparse.ArgumentParser(description='pal_filter')
89 parser.add_argument('-i','--input1', help='Forward paired-end fastq file', \
90 required=True)
91
92 parser.add_argument('-j','--input2', help='Reverse paired-end fastq file', \
93 required=True)
94
95 parser.add_argument('-p','--pal_finder', help='Output from pal_finder ', \
96 required=True)
97
98 parser.add_argument('-assembly','--assembly_QC', help='Perform the PandaSeq \
99 based QC', nargs='?', const=1, type=int, required=False)
100
101 parser.add_argument('-a','--skip_assembly', help='If the assembly has already \
102 been run, skip it with -a', nargs='?', const=1, \
103 type=int, required=False)
104
105 parser.add_argument('-c','--skip_conversion', help='If the fastq to fasta \
106 conversion has already been run, skip it with -c', \
107 nargs='?', const=1, type=int, required=False)
108
109 parser.add_argument('-primers','--filter_by_primer_column', help='Filter \
110 pal_finder output to just those loci which have primers \
111 designed', nargs='?', const=1, type=int, required=False)
112
113 parser.add_argument('-occurrences','--filter_by_occurrences_column', \
114 help='Filter pal_finder output to just loci with primers \
115 which only occur once in the dataset', nargs='?', \
116 const=1, type=int, required=False)
117
118 parser.add_argument('-rankmotifs','--filter_and_rank_by_motif_size', \
119 help='Filter pal_finder output to just loci which are a \
120 perfect repeat unit. Also, rank the loci by motif size \
121 (largest first)', nargs='?', const=1, type=int, \
122 required=False)
123
124 args = vars(parser.parse_args())
125
126 # parse arguments
127
128 R1_input = args['input1'];
129 R2_input = args['input2'];
130 pal_finder_output = args['pal_finder'];
131 perform_assembly = args['assembly_QC']
132 skip_assembly = args['skip_assembly'];
133 skip_conversion = args['skip_conversion'];
134 filter_primers = args['filter_by_primer_column'];
135 filter_occurrences = args['filter_by_occurrences_column'];
136 filter_rank_motifs = args['filter_and_rank_by_motif_size'];
137
138 # set default values for arguments
139 if perform_assembly is None:
140 perform_assembly = 0
141 if skip_assembly is None:
142 skip_assembly = 0
143 if skip_conversion is None:
144 skip_conversion = 0
145 if filter_primers is None:
146 filter_primers = 0
147 if filter_occurrences is None:
148 filter_occurrences = 0
149 if filter_rank_motifs is None:
150 filter_rank_motifs = 0
151
152 ############################################################
153 # Function List #
154 ############################################################
155
156 # Reverse complement a sequence
157
158 def ReverseComplement1(seq):
159 seq_dict = {'A':'T','T':'A','G':'C','C':'G'}
160 return "".join([seq_dict[base] for base in reversed(seq)])
161
162 # Convert a .fastq to a .fasta, filter to just lines we want
163 # and strip MiSeq barcodes
164
165 def fastq_to_fasta(file):
166 file_name = os.path.splitext(os.path.basename(file))[0]
167 with open(file_name + "_filtered.fasta", "w") as out:
168 for record in SeqIO.parse(file, "fastq"):
169 ID = str(record.id)
170 SEQ = str(record.seq)
171 if ID in wanted:
172 out.write(">" + ID + "\n" + SEQ + "\n")
173
174 # strip the miseq barcodes from a fasta file
175
176 def strip_barcodes(file):
177 file_name = os.path.splitext(os.path.basename(file))[0]
178 with open(file_name + "_adapters_removed.fasta", "w") as out:
179 for record in SeqIO.parse(file, "fasta"):
180 match = re.search(r'\S*:', record.id)
181 if match:
182 correct = match.group().rstrip(":")
183 else:
184 correct = str(record.id)
185 SEQ = str(record.seq)
186 if correct in wanted:
187 out.write(">" + correct + "\n" + SEQ + "\n")
188
189 ############################################################
190 # MAIN PROGRAM #
191 ############################################################
192 print "\n~~~~~~~~~~"
193 print "pal_filter"
194 print "~~~~~~~~~~\n"
195 time.sleep(1)
196 print "\"Find the optimum loci in your pal_finder output and increase "\
197 "the rate of successful microsatellite marker development\""
198 print "\nSee Griffiths et al. (currently unpublished) for more details......"
199 time.sleep(2)
200
201 if (perform_assembly == 0 and filter_primers == 0 and filter_occurrences == 0 \
202 and filter_rank_motifs == 0):
203 print "\nNo optional arguments supplied."
204 print "\nBy default this program does nothing."
205 print "\nNo files produced and no modifications made."
206 print "\nFinished.\n"
207 exit()
208
209 else:
210 print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
211 print "Checking supplied filtering parameters:"
212 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
213 time.sleep(2)
214 if filter_primers == 1:
215 print "-primers flag supplied."
216 print "Filtering pal_finder output on the \"Primers found (1=y,0=n)\"" \
217 " column."
218 print "Only rows where primers have successfully been designed will"\
219 " pass the filter.\n"
220 time.sleep(2)
221 if filter_occurrences == 1:
222 print "-occurrences flag supplied."
223 print "Filtering pal_finder output on the \"Occurences of Forward" \
224 " Primer in Reads\" and \"Occurences of Reverse Primer" \
225 " in Reads\" columns."
226 print "Only rows where both primers occur only a single time in the"\
227 " reads pass the filter.\n"
228 time.sleep(2)
229 if filter_rank_motifs == 1:
230 print "-rankmotifs flag supplied."
231 print "Filtering pal_finder output on the \"Motifs(bases)\" column to" \
232 " just those with perfect repeats."
233 print "Only rows containing 'perfect' repeats will pass the filter."
234 print "Also, ranking output by size of motif (largest first).\n"
235 time.sleep(2)
236 # index the raw fastq files so that the sequences can be pulled out and
237 # added to the filtered output file
238
239 # Indexing input sequence files
240 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
241 print "Indexing FastQ files....."
242 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
243 R1fastq_sequences_index = SeqIO.index(R1_input,'fastq')
244 R2fastq_sequences_index = SeqIO.index(R2_input,'fastq')
245 print "Indexing complete."
246
247 # create a set to hold the filtered output
248 wanted_lines = set()
249
250 # get lines from the pal_finder output which meet filter settings
251
252 # read the pal_finder output file into a csv reader
253 with open (pal_finder_output) as csvfile_infile:
254 csv_f = csv.reader(csvfile_infile, delimiter='\t')
255 header = csv_f.next()
256 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \
257 ".filtered", 'w') as csvfile_outfile:
258 # write the header line for the output file
259 csvfile_outfile.write('\t'.join(header) + "\tR1_Sequence_ID\t"
260 "R1_Sequence\tR2_Sequence_ID\tR2_Sequence\n")
261 for row in csv_f:
262 # get the sequence ID
263 seq_ID = row[0]
264 # get the raw sequence reads and convert to a format that can
265 # go into a tsv file
266 R1_sequence = R1fastq_sequences_index[seq_ID].format("fasta").\
267 replace("\n","\t",1).replace("\n","")
268 R2_sequence = R2fastq_sequences_index[seq_ID].format("fasta").\
269 replace("\n","\t",1).replace("\n","")
270 seq_info = "\t" + R1_sequence + "\t" + R2_sequence + "\n"
271 # navigate through all different combinations of filter options
272 # if the primer filter is switched on
273 if filter_primers == 1:
274 # check the occurrences of primers field
275 if row[5] == "1":
276 # if filter occurrences of primers is switched on
277 if filter_occurrences == 1:
278 # check the occurrences of primers field
279 if (row[15] == "1" and row[16] == "1"):
280 # if rank by motif is switched on
281 if filter_rank_motifs == 1:
282 # check for perfect motifs
283 if row[1].count('(') == 1:
284 # all 3 filter switched on
285 # write line out to output
286 csvfile_outfile.write('\t'.join(row) + \
287 seq_info)
288
289 else:
290 csvfile_outfile.write('\t'.join(row) + seq_info)
291 elif filter_rank_motifs == 1:
292 if row[1].count('(') == 1:
293 csvfile_outfile.write('\t'.join(row) + seq_info)
294 else:
295 csvfile_outfile.write('\t'.join(row) + seq_info)
296 elif filter_occurrences == 1:
297 if (row[15] == "1" and row[16] == "1"):
298 if filter_rank_motifs == 1:
299 if row[1].count('(') == 1:
300 csvfile_outfile.write('\t'.join(row) + seq_info)
301 else:
302 csvfile_outfile.write('\t'.join(row) + seq_info)
303 elif filter_rank_motifs == 1:
304 if row[1].count('(') == 1:
305 csvfile_outfile.write('\t'.join(row) + seq_info)
306 else:
307 csvfile_outfile.write('\t'.join(row) + seq_info)
308
309 # if filter_rank_motifs is active, order the file by the size of the motif
310 if filter_rank_motifs == 1:
311 rank_motif = []
312 ranked_list = []
313 # read in the non-ordered file and add every entry to rank_motif list
314 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \
315 ".filtered") as csvfile_ranksize:
316 csv_rank = csv.reader(csvfile_ranksize, delimiter='\t')
317 header = csv_rank.next()
318 for line in csv_rank:
319 rank_motif.append(line)
320
321 # open the file ready to write the ordered list
322 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \
323 ".filtered", 'w') as rank_outfile:
324 rankwriter = csv.writer(rank_outfile, delimiter='\t', \
325 lineterminator='\n')
326 rankwriter.writerow(header)
327 count = 2
328 while count < 10:
329 for row in rank_motif:
330 # count size of motif
331 motif = re.search(r'[ATCG]*',row[1])
332 if motif:
333 the_motif = motif.group()
334 # rank it and write into ranked_list
335 if len(the_motif) == count:
336 ranked_list.insert(0, row)
337 count = count + 1
338 # write out the ordered list, into the .filtered file
339 for row in ranked_list:
340 rankwriter.writerow(row)
341
342 print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
343 print "Checking assembly flags supplied:"
344 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
345 if perform_assembly == 0:
346 print "Assembly flag not supplied. Not performing assembly QC.\n"
347 if perform_assembly == 1:
348 print "-assembly flag supplied: Performing PandaSeq assembly quality checks."
349 print "See Fox et al. (currently unpublished) for full details on the"\
350 " quality-check process.\n"
351 time.sleep(5)
352
353 # Get readID, F primers, R primers and motifs from filtered pal_finder output
354 seqIDs = []
355 motif = []
356 F_primers = []
357 R_primers = []
358 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \
359 ".filtered") as input_csv:
360 pal_finder_csv = csv.reader(input_csv, delimiter='\t')
361 header = pal_finder_csv.next()
362 for row in pal_finder_csv:
363 seqIDs.append(row[0])
364 motif.append(row[1])
365 F_primers.append(row[7])
366 R_primers.append(row[9])
367
368 # Get a list of just the unique IDs we want
369 wanted = set()
370 for line in seqIDs:
371 wanted.add(line)
372
373 # Assemble the paired end reads into overlapping contigs using PandaSeq
374 # (can be skipped with the -a flag if assembly has already been run
375 # and the appropriate files are in the same directory as the script,
376 # and named "Assembly.fasta" and "Assembly_adapters_removed.fasta")
377 #
378 # The first time you riun the script you MUST not enable the -a flag.t
379 # but you are able to skip the assembly in subsequent analysis using the
380 # -a flag.
381
382 if skip_assembly == 0:
383 pandaseq_command = 'pandaseq -A pear -f ' + R1_input + ' -r ' + \
384 R2_input + ' -o 25 -t 0.95 -w Assembly.fasta'
385 subprocess.call(pandaseq_command, shell=True)
386 strip_barcodes("Assembly.fasta")
387 print "\nPaired end reads been assembled into overlapping reads."
388 print "\nFor future analysis, you can skip this assembly step using \
389 the -a flag, provided that the assembly.fasta file is \
390 intact and in the same location."
391 else:
392 print "\n(Skipping the assembly step as you provided the -a flag)"
393
394 # Fastq files need to be converted to fasta. The first time you run the script
395 # you MUST not enable the -c flag, but you are able to skip the conversion
396 # later using the -c flag. Make sure the fasta files are in the same location
397 #and do not change the filenames
398
399 if skip_conversion ==0:
400 fastq_to_fasta(R1_input)
401 fastq_to_fasta(R2_input)
402 print "\nThe input fastq files have been converted to the fasta format."
403 print "\nFor any future analysis, you can skip this conversion step \
404 using the -c flag, provided that the fasta files \
405 are intact and in the same location."
406 else:
407 print "\n(Skipping the fastq -> fasta conversion as you provided the" \
408 " -c flag).\n"
409
410 # get the files and everything else we will need
411
412 # Assembled fasta file
413 assembly_file = "Assembly_adapters_removed.fasta"
414 # filtered R1 reads
415 R1_fasta = os.path.splitext(os.path.basename(R1_input))[0] + \
416 "_filtered.fasta"
417 # filtered R2 reads
418 R2_fasta = os.path.splitext(os.path.basename(R2_input))[0] + \
419 "_filtered.fasta"
420
421 outputfilename = os.path.splitext(os.path.basename(R1_input))[0]
422
423 # parse the files with SeqIO
424 assembly_sequences = SeqIO.parse(assembly_file,'fasta')
425 R1fasta_sequences = SeqIO.parse(R1_fasta,'fasta')
426
427 # create some empty lists to hold the ID tags we are interested in
428 assembly_IDs = []
429 fasta_IDs = []
430
431 # populate the above lists with sequence IDs
432 for sequence in assembly_sequences:
433 assembly_IDs.append(sequence.id)
434 for sequence in R1fasta_sequences:
435 fasta_IDs.append(sequence.id)
436
437 # Index the assembly fasta file
438 assembly_sequences_index = SeqIO.index(assembly_file,'fasta')
439 R1fasta_sequences_index = SeqIO.index(R1_fasta,'fasta')
440 R2fasta_sequences_index = SeqIO.index(R2_fasta,'fasta')
441
442 # prepare the output file
443 with open (outputfilename + "_pal_filter_assembly_output.txt", 'w') \
444 as outputfile:
445 # write the headers for the output file
446 outputfile.write("readPairID\t Forward Primer\t F Primer Position in "
447 "Assembled Read\t Reverse Primer\t R Primer Position in "
448 "Assembled Read\t Motifs(bases)\t Assembled Read ID\t "
449 "Assembled Read Sequence\t Raw Forward Read ID\t Raw "
450 "Forward Read Sequence\t Raw Reverse Read ID\t Raw Reverse "
451 "Read Sequence\n")
452
453 # cycle through parameters from the pal_finder output
454 for x, y, z, a in zip(seqIDs, F_primers, R_primers, motif):
455 if str(x) in assembly_IDs:
456 # get the raw sequences ready to go into the output file
457 assembly_seq = (assembly_sequences_index.get_raw(x).decode())
458 # fasta entries need to be converted to single line so sit nicely in the output
459 assembly_output = assembly_seq.replace("\n","\t")
460 R1_fasta_seq = (R1fasta_sequences_index.get_raw(x).decode())
461 R1_output = R1_fasta_seq.replace("\n","\t",1).replace("\n","")
462 R2_fasta_seq = (R2fasta_sequences_index.get_raw(x).decode())
463 R2_output = R2_fasta_seq.replace("\n","\t",1).replace("\n","")
464 assembly_no_id = '\n'.join(assembly_seq.split('\n')[1:])
465
466 # check that both primer sequences can be seen in the assembled contig
467 if y or ReverseComplement1(y) in assembly_no_id and z or \
468 ReverseComplement1(z) in assembly_no_id:
469 if y in assembly_no_id:
470 # get the positions of the primers in the assembly to predict fragment length
471 F_position = assembly_no_id.index(y)+len(y)+1
472 if ReverseComplement1(y) in assembly_no_id:
473 F_position = assembly_no_id.index(ReverseComplement1(y)\
474 )+len(ReverseComplement1(y))+1
475 if z in assembly_no_id:
476 R_position = assembly_no_id.index(z)+1
477 if ReverseComplement1(z) in assembly_no_id:
478 R_position = assembly_no_id.index(ReverseComplement1(z)\
479 )+1
480
481 # write everything out into the output file
482 output = (str(x) + "\t" + y + "\t" + str(F_position) \
483 + "\t" + (z) + "\t" + str(R_position) \
484 + "\t" + a + "\t" + assembly_output \
485 + R1_output + "\t" + R2_output + "\n")
486 outputfile.write(output)
487 print "\nPANDAseq quality check complete."
488 print "Results from PANDAseq quality check (and filtering, if any" \
489 " any filters enabled) written to output file" \
490 " ending \"_pal_filter_assembly_output.txt\".\n\n"
491
492 print "Filtering of pal_finder results complete."
493 print "Filtered results written to output file ending \".filtered\"."
494 print "\nFinished\n"
495 else:
496 if (skip_assembly == 1 or skip_conversion == 1):
497 print "\nERROR: You cannot supply the -a flag or the -c flag without \
498 also supplying the -assembly flag.\n"
499
500 print "\nProgram Finished\n"