Mercurial > repos > thondeboer > neat_genreads
diff utilities/validateFQ.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author | thondeboer |
---|---|
date | Tue, 15 May 2018 02:39:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utilities/validateFQ.py Tue May 15 02:39:53 2018 -0400 @@ -0,0 +1,78 @@ +#!/usr/bin/env python + +# +# A quickie tool for validating the correctness of a FASTQ file +# +# python validateFQ.py read1.fq [read2.fq] +# + +import sys + +def get4lines(fn): + l1 = fn.readline().strip() + l2 = fn.readline().strip() + l3 = fn.readline().strip() + l4 = fn.readline().strip() + if any([l1,l2,l3,l4]) and not all([l1,l2,l3,l4]): + print '\nError: missing lines:\n' + print l1+'\n'+l2+'\n'+l3+'\n'+l4+'\n' + exit(1) + return (l1,l2,l3,l4) + +ALLOWED_QUAL = '!\"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJ' +ALLOWED_NUCL = 'ACGTN' + +def validate4lines(l1,l2,l3,l4): + failed = 0 + # make sure lines contain correct delimiters + if l1[0] != '@' or l1[-2] != '/' or l3[0] != '+': + failed = 1 + # make sure seq len == qual length + if len(l2) != len(l4): + failed = 2 + # make sure seq string contains only valid characters + for n in l2: + if n not in ALLOWED_NUCL: + failed = 3 + # make sure qual string contains only valid characters + for n in l4: + if n not in ALLOWED_QUAL: + failed = 4 + if failed: + print '\nError: malformed lines:' + if failed == 1: print ' ---- invalid delimiters\n' + elif failed == 2: print ' ---- seq len != qual len\n' + elif failed == 3: print ' ---- seq contains invalid characters\n' + elif failed == 4: print ' ---- qual contains invalid characters\n' + print l1+'\n'+l2+'\n'+l3+'\n'+l4+'\n' + exit(1) + +f1 = open(sys.argv[1],'r') +(l1_r1, l2_r1, l3_r1, l4_r1) = get4lines(f1) +f2 = None +if len(sys.argv) == 3: + f2 = open(sys.argv[2],'r') + (l1_r2, l2_r2, l3_r2, l4_r2) = get4lines(f2) + +while l1_r1: + # check line syntax + validate4lines(l1_r1,l2_r1,l3_r1,l4_r1) + if f2 != None: + validate4lines(l1_r2,l2_r2,l3_r2,l4_r2) + # make sure seq id is same for r1/r2 + if l1_r1[:-1] != l1_r2[:-1]: + print '\nError: mismatched r1/r2 name:\n' + print l1_r1+'\n'+l1_r2+'\n' + exit(1) + + # grab next 4 lines... + (l1_r1, l2_r1, l3_r1, l4_r1) = get4lines(f1) + if f2 != None: + (l1_r2, l2_r2, l3_r2, l4_r2) = get4lines(f2) + +if f2 != None: + f2.close() +f1.close() + +print '\nPASSED WITH FLYING COLORS. GOOD DAY.\n' +