comparison utilities/validateFQ.py @ 0:6e75a84e9338 draft

planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author thondeboer
date Tue, 15 May 2018 02:39:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6e75a84e9338
1 #!/usr/bin/env python
2
3 #
4 # A quickie tool for validating the correctness of a FASTQ file
5 #
6 # python validateFQ.py read1.fq [read2.fq]
7 #
8
9 import sys
10
def get4lines(fn):
    """Read the next four-line FASTQ record from an open file.

    Args:
        fn: Open text file object; four lines are consumed from its
            current position via ``readline()``.

    Returns:
        A tuple ``(l1, l2, l3, l4)`` holding the four lines with leading
        and trailing whitespace stripped. All four entries are empty
        strings when only blank lines or end-of-file were read, which the
        caller uses as its "no more records" signal.

    Exits:
        Prints the partial record and terminates the process with status 1
        when some, but not all, of the four lines are empty.
    """
    # A generator keeps the four reads strictly sequential while avoiding
    # four copy-pasted statements.
    lines = tuple(fn.readline().strip() for _ in range(4))
    if any(lines) and not all(lines):
        print('\nError: missing lines:\n')
        print('\n'.join(lines) + '\n')
        # sys.exit() rather than the site-provided exit(): the latter is an
        # interactive convenience that is not guaranteed to exist
        # (e.g. when Python is started with -S).
        sys.exit(1)
    return lines
21
# Characters accepted in a quality string: the contiguous ASCII range
# '!' (33) through 'J' (74).
ALLOWED_QUAL = '!\"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJ'
# Characters accepted in a sequence string (upper case only).
ALLOWED_NUCL = 'ACGTN'


def validate4lines(l1, l2, l3, l4):
    """Check one four-line FASTQ record and abort the program if it is bad.

    The checks, in order, are:
      1. ``l1`` starts with '@', has '/' as its second-to-last character
         (e.g. a name ending in ``/1``), and ``l3`` starts with '+';
      2. ``l2`` and ``l4`` have the same length;
      3. ``l2`` contains only characters from ``ALLOWED_NUCL``;
      4. ``l4`` contains only characters from ``ALLOWED_QUAL``.

    When several checks fail, the reason reported is that of the
    highest-numbered failing check.

    Args:
        l1: Read-name line.
        l2: Sequence line.
        l3: Separator line.
        l4: Quality line.

    Returns:
        None when the record passes every check.

    Exits:
        Prints the reason and the offending record, then terminates the
        process with status 1, when any check fails.
    """
    reasons = {
        1: ' ---- invalid delimiters\n',
        2: ' ---- seq len != qual len\n',
        3: ' ---- seq contains invalid characters\n',
        4: ' ---- qual contains invalid characters\n',
    }

    failed = 0
    # Slices instead of indexing: an empty or one-character line then counts
    # as an invalid delimiter instead of raising IndexError.
    if l1[:1] != '@' or l1[-2:-1] != '/' or l3[:1] != '+':
        failed = 1
    if len(l2) != len(l4):
        failed = 2
    if not set(l2).issubset(ALLOWED_NUCL):
        failed = 3
    if not set(l4).issubset(ALLOWED_QUAL):
        failed = 4

    if failed:
        print('\nError: malformed lines:')
        print(reasons[failed])
        print(l1 + '\n' + l2 + '\n' + l3 + '\n' + l4 + '\n')
        # sys.exit() rather than the site-provided exit(), which is meant
        # for interactive sessions and may be absent (python -S).
        sys.exit(1)
49
def _check_records(f1, f2):
    """Validate every record of ``f1`` and, when given, its mate file ``f2``.

    Records are read in lock-step (r1 first, then r2). Each record is
    syntax-checked with ``validate4lines``; in paired mode the two read
    names must also match once their final character is dropped.

    Args:
        f1: Open file object for read 1.
        f2: Open file object for read 2, or None for single-end input.

    Exits:
        Terminates with status 1 (after printing the reason) on the first
        malformed record, mismatched name, or when one paired file runs out
        of records before the other.
    """
    paired = f2 is not None
    while True:
        rec1 = get4lines(f1)
        rec2 = get4lines(f2) if paired else None
        name1 = rec1[0]
        # In single-end mode mirror name1 so the end-of-input tests below
        # need no special case.
        name2 = rec2[0] if paired else name1

        if not name1 and not name2:
            return
        if not name1 or not name2:
            # One file ended early. Without this check a longer r2 would be
            # silently ignored and a shorter r2 would hand empty lines to
            # the record validator.
            print('\nError: r1/r2 contain a different number of reads:\n')
            print(name1 + '\n' + name2 + '\n')
            sys.exit(1)

        # check line syntax
        validate4lines(*rec1)
        if paired:
            validate4lines(*rec2)
            # make sure seq id is same for r1/r2 (ignoring the last char)
            if name1[:-1] != name2[:-1]:
                print('\nError: mismatched r1/r2 name:\n')
                print(name1 + '\n' + name2 + '\n')
                sys.exit(1)


def main(argv=None):
    """Command-line entry point: ``python validateFQ.py read1.fq [read2.fq]``.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name first).
            Defaults to ``sys.argv``. The second file is used only when
            exactly two file arguments are supplied.

    Exits:
        Status 1 with a usage message when no input file is given, or via
        the validation helpers when a file is invalid.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print('\nUsage: python validateFQ.py read1.fq [read2.fq]\n')
        sys.exit(1)

    f1 = open(argv[1], 'r')
    try:
        f2 = open(argv[2], 'r') if len(argv) == 3 else None
        try:
            _check_records(f1, f2)
        finally:
            # Close the handles even when validation aborts via sys.exit().
            if f2 is not None:
                f2.close()
    finally:
        f1.close()

    print('\nPASSED WITH FLYING COLORS. GOOD DAY.\n')


if __name__ == '__main__':
    main()
78