Mercurial > repos > devteam > ncbi_blast_plus
comparison tools/ncbi_blast_plus/check_no_duplicates.py @ 11:4c4a0da938ff draft
Uploaded v0.0.22, now wraps BLAST+ 2.2.28 allowing extended tabular output to include the hit descriptions as column 25.
Supports $GALAXY_SLOTS.
Includes more tests and heavy use of macros.
author | peterjc |
---|---|
date | Thu, 05 Dec 2013 06:55:59 -0500 |
parents | |
children | 3034ce97dd33 |
comparison
equal
deleted
inserted
replaced
10:70e7dcbf6573 | 11:4c4a0da938ff |
---|---|
1 #!/usr/bin/env python | |
2 """Check for duplicate sequence identifiers in FASTA files. | |
3 | |
4 This is run as a pre-check before makeblastdb, in order to avoid | |
5 a regression bug in BLAST+ 2.2.28 which fails to catch this. See: | |
6 http://blastedbio.blogspot.co.uk/2012/10/my-ids-not-good-enough-for-ncbi-blast.html | |
7 | |
8 This script takes one or more FASTA filenames as input, and | |
9 will return a non-zero error if any duplicate identifiers | |
10 are found. | |
11 """ | |
12 import sys | |
13 import os | |
14 | |
# Report the tool version and exit successfully when either version flag
# appears anywhere on the command line.
if any(flag in sys.argv for flag in ("-v", "--version")):
    print("v0.0.22")
    sys.exit(0)
18 | |
def stop_err(msg, error=1):
    """Write *msg* plus a newline to stderr, then exit with status *error*.

    The exit is performed by raising SystemExit, so this never returns.
    """
    text = "%s\n" % msg
    sys.stderr.write(text)
    raise SystemExit(error)
22 | |
23 | |
# Scan every FASTA file named on the command line and collect the sequence
# identifiers (the first whitespace-delimited word after ">") into one set
# shared across all files, so duplicates are detected between files as well
# as within a file.
#
# Exit statuses used below:
#   1 - a repeated identifier was found
#   2 - a named file does not exist (or is not a regular file)
#   3 - no filenames were given at all
identifiers = set()
files = 0
for filename in sys.argv[1:]:
    if not os.path.isfile(filename):
        stop_err("Missing FASTA file %r" % filename, 2)
    files += 1
    # The context manager closes the handle on every exit path, including
    # the SystemExit raised by stop_err and any error while reading.
    with open(filename) as handle:
        for line in handle:
            if not line.startswith(">"):
                continue
            # The split also takes care of the new line character,
            # e.g. ">test\n" and ">test description here\n" both give "test".
            fields = line[1:].split(None, 1)
            # A bare ">" header yields no fields; treat it as the empty
            # identifier rather than crashing with an IndexError.
            seq_id = fields[0] if fields else ""
            if seq_id in identifiers:
                stop_err("Repeated identifiers, e.g. %r" % seq_id, 1)
            identifiers.add(seq_id)
if not files:
    stop_err("No FASTA files given to check for duplicates", 3)
elif files == 1:
    print("%i sequences" % len(identifiers))
else:
    print("%i sequences in %i FASTA files" % (len(identifiers), files))