Mercurial > repos > thondeboer > neat_genreads
comparison utilities/validateFQ.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author | thondeboer |
---|---|
date | Tue, 15 May 2018 02:39:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6e75a84e9338 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # | |
4 # A quickie tool for validating the correctness of a FASTQ file | |
5 # | |
6 # python validateFQ.py read1.fq [read2.fq] | |
7 # | |
8 | |
9 import sys | |
10 | |
def get4lines(fn):
    """Read the next four lines (one FASTQ record) from an open file.

    fn -- an open, readable text file object.

    Each line is stripped of surrounding whitespace. Returns the tuple
    (l1, l2, l3, l4). Four empty strings mean there is nothing left to
    read (end of file, or four consecutive blank lines).

    If some, but not all, of the four lines are empty the record is
    incomplete: the lines are printed and the program exits with status 1
    (SystemExit).
    """
    lines = tuple(fn.readline().strip() for _ in range(4))
    # some-but-not-all non-empty lines means a partial record
    if any(lines) and not all(lines):
        # single-argument print(...) behaves identically on Python 2 and 3
        print('\nError: missing lines:\n')
        print('\n'.join(lines) + '\n')
        # sys.exit rather than the site-provided exit() helper, which is
        # not guaranteed to exist in every interpreter configuration
        sys.exit(1)
    return lines
21 | |
# Quality characters accepted in line 4: ASCII '!' (33) through 'J' (74).
# NOTE(review): presumably Phred+33 scores 0-41 -- confirm against the
# read simulator's output range.
ALLOWED_QUAL = '!\"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJ'
# Nucleotide characters accepted in line 2 (upper case only).
ALLOWED_NUCL = 'ACGTN'


def validate4lines(l1, l2, l3, l4):
    """Check the four lines of one FASTQ record for well-formedness.

    l1 -- header line; must start with '@' and have '/' as its
          second-to-last character (e.g. '@name/1')
    l2 -- sequence; only characters from ALLOWED_NUCL
    l3 -- separator line; must start with '+'
    l4 -- qualities; only characters from ALLOWED_QUAL, same length as l2

    Returns None when the record is valid. Otherwise prints a description
    of the problem followed by the four lines and exits with status 1
    (SystemExit). When several checks fail, the last failing check in the
    order delimiters, length, sequence characters, quality characters is
    the one reported.
    """
    failed = 0
    # make sure lines contain correct delimiters; startswith/slicing are
    # used instead of indexing so that empty or one-character lines are
    # reported as malformed rather than raising IndexError
    if not l1.startswith('@') or l1[-2:-1] != '/' or not l3.startswith('+'):
        failed = 1
    # make sure seq len == qual length
    if len(l2) != len(l4):
        failed = 2
    # make sure seq string contains only valid characters
    if any(n not in ALLOWED_NUCL for n in l2):
        failed = 3
    # make sure qual string contains only valid characters
    if any(n not in ALLOWED_QUAL for n in l4):
        failed = 4
    if failed:
        reasons = {
            1: ' ---- invalid delimiters\n',
            2: ' ---- seq len != qual len\n',
            3: ' ---- seq contains invalid characters\n',
            4: ' ---- qual contains invalid characters\n',
        }
        # single-argument print(...) behaves identically on Python 2 and 3
        print('\nError: malformed lines:')
        print(reasons[failed])
        print(l1 + '\n' + l2 + '\n' + l3 + '\n' + l4 + '\n')
        sys.exit(1)
49 | |
def main(argv=None):
    """Validate one FASTQ file, or a pair of read1/read2 FASTQ files.

    argv -- argument list in sys.argv layout (program name first);
            defaults to sys.argv.

    Every record of read1 (and read2, when given) is checked with
    validate4lines. For paired input the record names must agree apart
    from their final character, and both files must contain the same
    number of records. Any problem is printed and the program exits with
    status 1 (SystemExit); on success a confirmation message is printed.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) not in (2, 3):
        print('\nUsage: python validateFQ.py read1.fq [read2.fq]\n')
        sys.exit(1)

    f1 = open(argv[1], 'r')
    f2 = None
    try:
        if len(argv) == 3:
            f2 = open(argv[2], 'r')

        rec1 = get4lines(f1)
        rec2 = get4lines(f2) if f2 is not None else None

        while rec1[0]:
            # check line syntax
            validate4lines(*rec1)
            if f2 is not None:
                # r2 ran out first: report it instead of validating an
                # empty record
                if not rec2[0]:
                    print('\nError: r2 has fewer reads than r1:\n')
                    print(rec1[0] + '\n')
                    sys.exit(1)
                validate4lines(*rec2)
                # make sure seq id is same for r1/r2 (the final character,
                # the mate number, is allowed to differ)
                if rec1[0][:-1] != rec2[0][:-1]:
                    print('\nError: mismatched r1/r2 name:\n')
                    print(rec1[0] + '\n' + rec2[0] + '\n')
                    sys.exit(1)

            # grab next 4 lines...
            rec1 = get4lines(f1)
            if f2 is not None:
                rec2 = get4lines(f2)

        # r1 is exhausted; leftover r2 records mean the files are unpaired
        if f2 is not None and rec2[0]:
            print('\nError: r2 has more reads than r1:\n')
            print(rec2[0] + '\n')
            sys.exit(1)
    finally:
        # close the inputs on every path, including SystemExit
        if f2 is not None:
            f2.close()
        f1.close()

    print('\nPASSED WITH FLYING COLORS. GOOD DAY.\n')


if __name__ == '__main__':
    main()
78 |