view utilities/validateFQ.py @ 10:7d10b55965c9 draft default tip

planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author thondeboer
date Wed, 16 May 2018 17:02:51 -0400
parents 6e75a84e9338
children
line wrap: on
line source

#!/usr/bin/env python

#
#	A quickie tool for validating the correctness of a FASTQ file
#
#	python validateFQ.py read1.fq [read2.fq]
#

import sys

def get4lines(fn):
	"""Read the next 4-line FASTQ record from an open file.

	Args:
		fn: an open text-mode file object (anything with readline()).

	Returns:
		A tuple (l1, l2, l3, l4) holding the next four lines with
		leading/trailing whitespace stripped.  At end of file all four
		entries are empty strings.

	Exits:
		Prints the offending lines to stdout and terminates with exit
		status 1 (SystemExit) when some, but not all, of the four lines
		are empty, i.e. the record is truncated or contains a blank line.

	NOTE: after stripping, a blank line cannot be told apart from end of
	file, so four consecutive blank lines look like EOF to the caller.
	"""
	# readline() returns '' at EOF, so reading past the end is harmless
	lines = tuple(fn.readline().strip() for _ in range(4))
	if any(lines) and not all(lines):
		# single-argument print(...) emits the same text on Python 2 and 3
		print('\nError: missing lines:\n')
		print('\n'.join(lines) + '\n')
		# sys.exit rather than the site-provided exit() helper, which is
		# not guaranteed to exist (e.g. under "python -S")
		sys.exit(1)
	return lines

# Characters accepted in the quality line: the contiguous ASCII range
# '!' (33) through 'J' (74) -- presumably Phred+33 scores 0-41.
ALLOWED_QUAL = '!\"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJ'
# Characters accepted in the sequence line (upper case only).
ALLOWED_NUCL = 'ACGTN'

def validate4lines(l1,l2,l3,l4):
	"""Check one 4-line FASTQ record and abort the program if it is malformed.

	Checks performed:
		1. l1 starts with '@', its second-to-last character is '/'
		   (read names are expected to end in '/<char>'), and l3 starts
		   with '+'
		2. l2 (sequence) and l4 (quality) have the same length
		3. l2 contains only characters from ALLOWED_NUCL
		4. l4 contains only characters from ALLOWED_QUAL

	Args:
		l1: header line
		l2: sequence line
		l3: separator line
		l4: quality line

	Returns:
		None when every check passes.

	Exits:
		Prints a diagnostic plus the four lines to stdout and terminates
		with exit status 1 (SystemExit) when any check fails.  If several
		checks fail, only the highest-numbered one is reported.
	"""
	failed = 0
	# make sure lines contain correct delimiters
	# (startswith / slicing instead of indexing, so that an empty or
	#  one-character line is reported as malformed rather than raising
	#  IndexError)
	if not l1.startswith('@') or l1[-2:-1] != '/' or not l3.startswith('+'):
		failed = 1
	# make sure seq len == qual length
	if len(l2) != len(l4):
		failed = 2
	# make sure seq string contains only valid characters
	if any(n not in ALLOWED_NUCL for n in l2):
		failed = 3
	# make sure qual string contains only valid characters
	if any(q not in ALLOWED_QUAL for q in l4):
		failed = 4
	if failed:
		reasons = {
			1: ' ---- invalid delimiters\n',
			2: ' ---- seq len != qual len\n',
			3: ' ---- seq contains invalid characters\n',
			4: ' ---- qual contains invalid characters\n',
		}
		# single-argument print(...) emits the same text on Python 2 and 3
		print('\nError: malformed lines:')
		print(reasons[failed])
		print(l1+'\n'+l2+'\n'+l3+'\n'+l4+'\n')
		# sys.exit rather than the site-provided exit() helper
		sys.exit(1)

#
#	Script body: validate the FASTQ file(s) named on the command line
#
#	sys.argv[1]	read1 FASTQ (required)
#	sys.argv[2]	read2 FASTQ (optional, enables the paired r1/r2 checks)
#

# reject a missing read1 argument (previously an IndexError) and surplus
# arguments (previously read2 was silently left unchecked)
if len(sys.argv) not in (2, 3):
	print('\nUsage: python validateFQ.py read1.fq [read2.fq]\n')
	sys.exit(1)

f1 = open(sys.argv[1],'r')
f2 = None
try:
	if len(sys.argv) == 3:
		f2 = open(sys.argv[2],'r')

	while True:
		# grab next 4 lines from each file...
		rec1 = get4lines(f1)
		rec2 = get4lines(f2) if f2 is not None else None

		# paired mode: both files must run out of records together,
		# otherwise one of them holds reads the other does not
		if rec2 is not None and bool(rec1[0]) != bool(rec2[0]):
			print('\nError: r1/r2 contain different numbers of reads:\n')
			print(rec1[0]+'\n'+rec2[0]+'\n')
			sys.exit(1)

		# an empty header line means end of file
		if not rec1[0]:
			break

		# check line syntax
		validate4lines(*rec1)
		if rec2 is not None:
			validate4lines(*rec2)
			# make sure seq id is same for r1/r2; the final character is
			# excluded from the comparison (presumably the 1/2 mate number)
			if rec1[0][:-1] != rec2[0][:-1]:
				print('\nError: mismatched r1/r2 name:\n')
				print(rec1[0]+'\n'+rec2[0]+'\n')
				sys.exit(1)
finally:
	# runs on normal completion and on sys.exit(), so handles never leak
	if f2 is not None:
		f2.close()
	f1.close()

print('\nPASSED WITH FLYING COLORS. GOOD DAY.\n')