comparison make_unique_id.py @ 0:a3a09dd8d09a draft

"planemo upload for repository https://github.com/brinkmanlab/galaxy-tools/tree/master/make_unique_id commit 33b02e08cbc8f76fb4b8537f8c968393f85a1b5e"
author brinkmanlab
date Fri, 24 Jan 2020 17:38:28 -0500
parents
children 061c3402a977
comparison
equal deleted inserted replaced
-1:000000000000 0:a3a09dd8d09a
1 #!/usr/bin/env python
2 import sys
3 from Bio import SeqIO
4 from collections import defaultdict
5
usage = """
make_unique_id
Makes all record ids unique across all input data.
All input data must be the same format.

Use: make_unique_id.py [-v] <format> <input1> <output1> [<input2> <output2> ... <inputn> <outputn>]
\t-v Print version and exit

Valid formats: clustal, embl, fasta, fasta-2line, fastq-sanger, fastq, fastq-solexa, fastq-illumina, genbank, gb, imgt,
nexus, phd, phylip, pir, seqxml, sff, stockholm, tab, qual
"""

# Genbank has a max length for the id and sequence length number: the renamed
# id plus the digits of the sequence length may occupy at most this many
# characters, so over-long ids are truncated before the suffix is appended.
_MAX_ID_AND_LENGTH_WIDTH = 27

# Every id handed out so far (original or generated), mapped to the next
# numeric suffix to try for that id. Shared across all input files so ids are
# unique across the whole run, not just within one file.
ids = defaultdict(int)


def makeUnique(seq):
    """Rename *seq* in place if its id has already been used in this run.

    The first record seen with a given id is returned untouched. Later
    records with the same id get a ``_<n>`` suffix on both ``id`` and
    ``name``; the id is truncated first when needed to respect the length
    budget. Each rename is reported on stdout as ``<old id>\\t<new id>``.

    Unlike a plain per-id counter, the generated id is checked against every
    id already handed out and is itself recorded, so a generated ``a_1``
    can never clash with a record whose id is literally ``a_1``, whichever
    order they arrive in.

    :param seq: record object exposing ``id``, ``name`` and ``len()``.
    :return: the same record object, possibly renamed.
    """
    original_id = seq.id
    count = ids[original_id]
    if not count:
        # First sighting: remember it and leave the record alone.
        ids[original_id] = 1
        return seq

    length_digits = len(str(len(seq)))
    while True:
        suffix = "_" + str(count)
        # Clamp at zero: a negative slice bound would strip characters from
        # the end of the id instead of limiting its length.
        room = max(_MAX_ID_AND_LENGTH_WIDTH - length_digits - len(suffix), 0)
        candidate = original_id[:room] + suffix
        if candidate not in ids:
            break
        # Already taken (by a real record or an earlier rename): try the next number.
        count += 1

    ids[original_id] = count + 1
    # Reserve the generated id so a later record carrying it gets renamed too.
    ids[candidate] = 1

    print(f"{original_id}\t{candidate}")
    seq.id = candidate
    # NOTE(review): name only receives the suffix and is not truncated like
    # id; confirm whether the output format's length limit also applies to it.
    seq.name += suffix

    return seq


def main(argv):
    """Run the command line tool and return the process exit status.

    :param argv: full argument vector, program name first
        (``[prog, format, input1, output1, ...]``).
    :return: 0 on success, 1 on a usage error.
    """
    if '-v' in argv:
        print('1.0')
        return 0

    if len(argv) < 4:
        print("Missing arguments", file=sys.stderr)
        print(usage, file=sys.stderr)
        return 1

    seq_format = argv[1]
    paths = argv[2:]
    if len(paths) % 2:
        # An unpaired trailing input used to be skipped without any warning.
        print("Every input needs a matching output", file=sys.stderr)
        print(usage, file=sys.stderr)
        return 1

    for in_path, out_path in zip(paths[::2], paths[1::2]):
        # map() keeps the pipeline lazy: records are renamed as they stream
        # from the parser into the writer.
        SeqIO.write(
            map(makeUnique, SeqIO.parse(in_path, seq_format)),
            out_path,
            seq_format
        )

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
57
58