Mercurial > repos > brinkmanlab > make_unique_id
comparison make_unique_id.py @ 0:a3a09dd8d09a draft
"planemo upload for repository https://github.com/brinkmanlab/galaxy-tools/tree/master/make_unique_id commit 33b02e08cbc8f76fb4b8537f8c968393f85a1b5e"
author | brinkmanlab |
---|---|
date | Fri, 24 Jan 2020 17:38:28 -0500 |
parents | |
children | 061c3402a977 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a3a09dd8d09a |
---|---|
1 #!/usr/bin/env python | |
2 import sys | |
3 from Bio import SeqIO | |
4 from collections import defaultdict | |
5 | |
6 usage = """ | |
7 make_unique_id | |
8 Makes all record ids unique across all input data. | |
9 All input data must be the same format. | |
10 | |
11 Use: make_unique_id.py [-v] <format> <input1> <output1> [<input2> <output2> ... <inputn> <outputn>] | |
12 \t-v Print version and exit | |
13 | |
14 Valid formats: clustal, embl, fasta, fasta-2line, fastq-sanger, fastq, fastq-solexa, fastq-illumina, genbank, gb, imgt, | |
15 nexus, phd, phylip, pir, seqxml, sff, stockholm, tab, qual | |
16 """ | |
17 | |
# Number of times each record id has been read from the input or emitted as a
# replacement id. Shared across all input files so that ids are unique over
# the whole run, not just within one file.
ids = defaultdict(int)


def makeUnique(seq):
    """Rename ``seq`` in place if its id has already been used, then return it.

    The first record seen with a given id is returned unchanged. Later records
    with the same id get ``_<n>`` appended to both ``seq.id`` and ``seq.name``,
    and the mapping ``<old id>\\t<new id>`` is printed to stdout.

    Generated ids are recorded in ``ids`` as well, and ``n`` is advanced until
    the candidate id is unused. This prevents a generated id from colliding
    with an id that appears literally in the input, or with another id that
    was truncated to the same prefix.

    :param seq: record object exposing ``id``, ``name`` and ``len()``.
    :return: the same ``seq`` object, possibly with a new ``id`` and ``name``.
    """
    original_id = seq.id
    count = ids[original_id]
    ids[original_id] += 1
    if not count:
        # First occurrence of this id: keep the record untouched.
        return seq

    # Number of digits needed to print the sequence length.
    seqlenlen = len(str(len(seq)))
    while True:
        suffix = "_" + str(count)
        newid = original_id
        if len(newid) + len(suffix) + 1 + seqlenlen > 28:
            # Genbank has a max length for the id and sequence length number,
            # truncate the sequence id if too long. The bound is clamped so an
            # extreme suffix/length cannot turn into a negative slice index.
            newid = newid[:max(0, 27 - seqlenlen - len(suffix))]
        candidate = newid + suffix
        if candidate not in ids:
            break
        # Candidate already taken (seen in the input or generated earlier):
        # try the next suffix number.
        count += 1

    # Reserve the generated id so later records cannot reuse it.
    ids[candidate] += 1

    print(f"{original_id}\t{candidate}")
    seq.id = candidate
    # NOTE(review): the name receives the suffix but is not length-limited
    # here; confirm whether it should be truncated like the id.
    seq.name += suffix
    return seq


def _main(argv):
    """Run the command-line tool on ``argv`` (same layout as ``sys.argv``).

    ``argv`` is ``[script, format, input1, output1, ...]``. Prints the version
    and exits with status 0 when ``-v`` is present; prints the usage text to
    stderr and exits with status 1 when arguments are missing or an input path
    has no matching output path.
    """
    if '-v' in argv:
        print('1.0')
        sys.exit(0)

    # At least one <input> <output> pair is required, and every input needs an
    # output: a well-formed argv therefore has an even length of 4 or more.
    if len(argv) < 4 or len(argv) % 2:
        print("Missing arguments", file=sys.stderr)
        print(usage, file=sys.stderr)
        sys.exit(1)

    seq_format = argv[1]

    # Pulling twice from the same iterator yields consecutive
    # (input, output) pairs.
    paths = iter(argv[2:])
    for in_path, out_path in zip(paths, paths):
        SeqIO.write(
            map(makeUnique, SeqIO.parse(in_path, seq_format)),
            out_path,
            seq_format,
        )


if __name__ == '__main__':
    _main(sys.argv)
57 | |
58 |