Mercurial > repos > devteam > ncbi_blast_plus
view tools/ncbi_blast_plus/check_no_duplicates.py @ 26:2889433c7ae1 draft
v0.3.3 - fixed legacy dependecy definition
author | peterjc |
---|---|
date | Sat, 20 Jul 2019 18:36:36 -0400 |
parents | 31e517610e1f |
children |
line wrap: on
line source
#!/usr/bin/env python """Check for duplicate sequence identifiers in FASTA files. This is run as a pre-check before makeblastdb, in order to avoid a regression bug in BLAST+ 2.2.28 which fails to catch this. See: http://blastedbio.blogspot.co.uk/2012/10/my-ids-not-good-enough-for-ncbi-blast.html This script takes one or more FASTA filenames as input, and will return a non-zero error if any duplicate identifiers are found. """ import gzip import os import sys if "-v" in sys.argv or "--version" in sys.argv: print("v0.0.23") sys.exit(0) identifiers = set() files = 0 for filename in sys.argv[1:]: if not os.path.isfile(filename): sys.stderr.write("Missing FASTA file %r\n" % filename) sys.exit(2) files += 1 with open(filename, "rb") as binary_handle: magic = binary_handle.read(2) if not magic: # Empty file, special case continue elif magic == b"\x1f\x8b": # Gzipped handle = gzip.open(filename, "rt") elif magic[0:1] == b">": # Not gzipped, shoudl be plain FASTA handle = open(filename, "r") for line in handle: if line.startswith(">"): # The split will also take care of the new line character, # e.g. ">test\n" and ">test description here\n" both give "test" seq_id = line[1:].split(None, 1)[0] if seq_id in identifiers: handle.close() sys.exit("Repeated identifiers, e.g. %r" % seq_id) identifiers.add(seq_id) handle.close() if not files: sys.stderr.write("No FASTA files given to check for duplicates\n") sys.exit(3) elif files == 1: print("%i sequences" % len(identifiers)) else: print("%i sequences in %i FASTA files" % (len(identifiers), files))