Mercurial > repos > brinkmanlab > make_unique_id
view make_unique_id.py @ 2:c8bda09480ae draft
"planemo upload for repository https://github.com/brinkmanlab/galaxy-tools/tree/master/make_unique_id commit 2e161ea2e4ddf5692f32ee389de42dc7c4fd5fa3"
author | brinkmanlab |
---|---|
date | Mon, 15 Jun 2020 19:04:53 -0400 |
parents | 061c3402a977 |
children | a2258ce2d58c |
line wrap: on
line source
#!/usr/bin/env python
import sys
from collections import defaultdict

usage = """
make_unique_id
Makes all record ids unique across all input data.
All input data must be the same format.
Use: make_unique_id.py [-v] <format> <input1> <output1> [<input2> <output2> ... <inputn> <outputn>]
\t-v Print version and exit

Valid formats: clustal, embl, fasta, fasta-2line, fastq-sanger, fastq, fastq-solexa, fastq-illumina, genbank, gb, imgt, nexus, phd, phylip, pir, seqxml, sff, stockholm, tab, qual
"""

# Width budget shared by the record id and the sequence-length number.
# The original script's rationale: Genbank has a max length for the id and
# sequence length number, so the id is truncated if too long.
MAX_ID_LENGTH = 16


def _fit_prefix(base, suffix, seq_length):
    """Shorten *base* so that base + suffix + separator + length digits fit.

    The budget is MAX_ID_LENGTH characters and reserves one character for the
    separator between the id and the sequence-length number, so the result
    always satisfies the same limit that triggers the truncation.
    """
    room = MAX_ID_LENGTH - 1 - len(str(seq_length)) - len(suffix)
    if len(base) > room:
        # max() guards against a negative slice bound, which would otherwise
        # trim characters from the right instead of emptying the prefix.
        return base[:max(room, 0)]
    return base


def unique_id(original_id, seq_length, counts, used):
    """Return ``(new_id, suffix)`` for a record, never repeating an id.

    Args:
        original_id: the id as read from the input record.
        seq_length: length of the record's sequence (its digit count is part
            of the width budget).
        counts: mapping from truncated id to the number of suffix slots
            already consumed for it; updated in place.
        used: set of every id handed out so far; updated in place.

    Returns:
        A tuple of the id to assign and the suffix that was appended to make
        it unique (``""`` when the truncated id could be used unchanged).
    """
    base = original_id[:MAX_ID_LENGTH]
    count = counts[base]
    while True:
        if count:
            suffix = "_" + str(count)
            candidate = _fit_prefix(base, suffix, seq_length) + suffix
        else:
            suffix = ""
            candidate = base
        count += 1
        # A candidate may already be taken by a literal id from the input
        # (e.g. "a", "a", "a_1") or by another id truncated to the same
        # prefix, so keep advancing the counter until it is really free.
        if candidate not in used:
            break
    counts[base] = count
    used.add(candidate)
    return candidate, suffix


def make_id_uniquifier():
    """Return a ``makeUnique(seq)`` callable sharing state across all inputs.

    The returned function rewrites ``seq.id`` (and appends the same suffix to
    ``seq.name``) in place, prints an ``old<TAB>new`` line to stdout whenever
    the id changed, and returns the record so it can be used with ``map``.
    """
    counts = defaultdict(int)
    used = set()

    def makeUnique(seq):
        new_id, suffix = unique_id(seq.id, len(seq), counts, used)
        if suffix:
            seq.name += suffix
        if seq.id != new_id:
            print(f"{seq.id}\t{new_id}")
        seq.id = new_id
        return seq

    return makeUnique


def main(argv=None):
    """Run the command line tool and return the process exit status.

    Args:
        argv: full argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 on success, 1 when the arguments are missing or unbalanced.
    """
    args = sys.argv if argv is None else argv

    if '-v' in args:
        print('1.0')
        return 0

    if len(args) < 4:
        print("Missing arguments", file=sys.stderr)
        print(usage, file=sys.stderr)
        return 1

    paths = args[2:]
    if len(paths) % 2:
        # Previously a trailing input without an output was silently skipped.
        print(f"Input '{paths[-1]}' has no matching output", file=sys.stderr)
        print(usage, file=sys.stderr)
        return 1

    # Imported here so that -v and argument errors do not require Biopython
    # and the renaming helpers can be imported on their own.
    from Bio import SeqIO

    seq_format = args[1]
    make_unique = make_id_uniquifier()
    for in_path, out_path in zip(paths[::2], paths[1::2]):
        SeqIO.write(
            map(make_unique, SeqIO.parse(in_path, seq_format)),
            out_path,
            seq_format,
        )
    return 0


if __name__ == '__main__':
    sys.exit(main())