Mercurial > repos > brinkmanlab > make_unique_id
changeset 0:a3a09dd8d09a draft
"planemo upload for repository https://github.com/brinkmanlab/galaxy-tools/tree/master/make_unique_id commit 33b02e08cbc8f76fb4b8537f8c968393f85a1b5e"
author | brinkmanlab |
---|---|
date | Fri, 24 Jan 2020 17:38:28 -0500 |
parents | |
children | 061c3402a977 |
files | make_unique_id.py make_unique_id.xml test-data/two_records.fastq |
diffstat | 3 files changed, 119 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
"""make_unique_id.py: make record ids unique across several sequence files.

Every input file is parsed with Biopython's SeqIO in the given format and
written back out in that same format.  The first record carrying an id keeps
it; any later record whose id has already been emitted gets a numeric
``_<n>`` suffix.  Each rename is reported on stdout as
``<old id><TAB><new id>``.
"""
import sys
from collections import defaultdict

usage = """
make_unique_id
Makes all record ids unique across all input data.
All input data must be the same format.

Use: make_unique_id.py [-v] <format> <input1> <output1> [<input2> <output2> ... <inputn> <outputn>]
\t-v Print version and exit

Valid formats: clustal, embl, fasta, fasta-2line, fastq-sanger, fastq, fastq-solexa, fastq-illumina, genbank, gb, imgt,
nexus, phd, phylip, pir, seqxml, sff, stockholm, tab, qual
"""

# Genbank has a max length for the id and sequence length number: the id,
# one separator character and the digits of the sequence length must fit in
# this many characters.  The limit is applied to every format, as before.
MAX_ID_FIELD = 28


def _fit_id(record_id, suffix, seq_len):
    """Return ``record_id`` shortened so id + suffix fits ``MAX_ID_FIELD``.

    :param record_id: the record's current id
    :param suffix: the ``_<n>`` suffix that will be appended afterwards
    :param seq_len: length of the record's sequence
    :return: ``record_id`` unchanged when everything fits, otherwise a prefix
    """
    len_digits = len(str(seq_len))
    if len(record_id) + len(suffix) + 1 + len_digits > MAX_ID_FIELD:
        return record_id[:MAX_ID_FIELD - 1 - len_digits - len(suffix)]
    return record_id


class UniqueIdMaker:
    """Callable that renames records so that no id is emitted twice.

    One instance is shared by all input files, which is what makes the ids
    unique across files rather than only within each file.
    """

    def __init__(self):
        self._seen = defaultdict(int)  # original id -> next suffix number
        self._used = set()             # every id already handed out

    def __call__(self, record):
        """Return ``record``, renamed in place if its id is already taken.

        :param record: object with ``id`` and ``name`` string attributes that
            supports ``len()`` (a SeqRecord when called from ``main``)
        :return: the same object, so the instance can be used with ``map``
        """
        original = record.id
        occurrence = self._seen[original]
        if occurrence == 0 and original not in self._used:
            self._seen[original] = 1
            self._used.add(original)
            return record

        # Counting occurrences alone is not enough: "a_1" may already exist,
        # either as an id from the input or as an earlier rename (possibly
        # after truncation).  Advance the counter until the id is free.
        number = max(occurrence, 1)
        while True:
            suffix = "_" + str(number)
            stem = _fit_id(original, suffix, len(record))
            if stem + suffix not in self._used:
                break
            number += 1

        self._seen[original] = number + 1
        self._used.add(stem + suffix)
        print(f"{original}\t{stem}{suffix}")
        record.id = stem + suffix
        # The name receives the suffix but is not truncated, as before.
        record.name += suffix
        return record


def main(argv=None):
    """Run the command line tool and return the process exit status.

    :param argv: full argument vector including the program name; defaults
        to ``sys.argv``
    :return: 0 on success, 1 on a usage error
    """
    argv = sys.argv if argv is None else argv

    if '-v' in argv:
        print('1.0')
        return 0

    if len(argv) < 4:
        print("Missing arguments", file=sys.stderr)
        print(usage, file=sys.stderr)
        return 1

    seq_format = argv[1]
    paths = argv[2:]
    if len(paths) % 2:
        # Pairing the paths up would silently ignore the last input.
        print("Every input needs a matching output", file=sys.stderr)
        print(usage, file=sys.stderr)
        return 1

    # Imported here so that -v and usage errors do not require Biopython.
    from Bio import SeqIO

    make_unique = UniqueIdMaker()
    for in_path, out_path in zip(paths[::2], paths[1::2]):
        SeqIO.write(
            map(make_unique, SeqIO.parse(in_path, seq_format)),
            out_path,
            seq_format
        )
    return 0


if __name__ == '__main__':
    sys.exit(main())
<!-- make_unique_id.xml: Galaxy tool wrapper around make_unique_id.py -->
<tool id="make-unique-id" name="BioPython Make Unique ID" version="1.0" profile="16.04">
    <description>Makes all record ids unique across all input data</description>
    <edam_topics>
        <edam_topic>topic_3345</edam_topic>
        <edam_topic>topic_3489</edam_topic>
        <edam_topic>topic_0091</edam_topic>
    </edam_topics>
    <edam_operations>
        <edam_operation>operation_3282</edam_operation>
    </edam_operations>
    <requirements>
        <requirement type="package" version="3.7">python</requirement>
        <requirement type="package" version="1.73">biopython</requirement>
    </requirements>
    <version_command><![CDATA[ python '$__tool_directory__/make_unique_id.py' -v ]]></version_command>
    <!-- The script takes the format once, followed by input/output path pairs.
         The format is taken from the first collection element; the script
         requires all inputs to share one format. -->
    <command detect_errors="aggressive"><![CDATA[
        python '$__tool_directory__/make_unique_id.py' '${inputs[0].ext}'
        #for $input, $output in $zip($inputs, $outputs)
            '$input' '$output'
        #end for
    ]]></command>
    <inputs>
        <param name="inputs" type="data_collection" format="clustal,embl,fasta,fasta-2line,fastq-sanger,fastq,fastq-solexa,fastq-illumina,genbank,gb,imgt,nexus,phd,phylip,pir,seqxml,sff,stockholm,tab,qual" label="Input" />
    </inputs>
    <outputs>
        <collection name="outputs" type="list" structured_like="inputs" inherit_format="true" />
    </outputs>
    <tests>
        <!-- The same file is supplied three times so that every id collides
             and the second and third elements must be renamed. -->
        <test expect_num_outputs="1">
            <param name="inputs" >
                <collection type="list">
                    <!-- value is resolved relative to the test-data directory;
                         two_records.fastq is the fixture shipped with the tool. -->
                    <element name="test1" value="two_records.fastq" ftype="fastq" />
                    <element name="test2" value="two_records.fastq" ftype="fastq" />
                    <element name="test3" value="two_records.fastq" ftype="fastq" />
                </collection>
            </param>
            <!-- NOTE(review): the checksum values are still placeholders and
                 must be replaced with real hashes of the expected outputs.
                 Also confirm that two_records.fastq (qualities written as
                 space-separated integers) parses under ftype "fastq". -->
            <output_collection name="outputs" type="list" count="3">
                <element name="test1" ftype="fastq" checksum="TODO" />
                <element name="test2" ftype="fastq" checksum="TODO" />
                <element name="test3" ftype="fastq" checksum="TODO" />
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[
    Ensure record IDs are unique across datasets.
    Can read/write any formats supported by BioPython SeqIO.

    This is useful when aggregating data downstream and you want to ensure that there are no ID collisions.
    ]]></help>
    <citations>
        <citation type="doi">10.5281/zenodo.3364789</citation>
    </citations>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/two_records.fastq Fri Jan 24 17:38:28 2020 -0500 @@ -0,0 +1,8 @@ +@HANNIBAL_1_FC302VTAAXX:2:1:228:167 +GAATTGATCAGGACATAGGACAACTGTAGGCACCAT ++HANNIBAL_1_FC302VTAAXX:2:1:228:167 +40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 +@HANNIBAL_1_FC302VTAAXX:2:1:156:340 +GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG ++HANNIBAL_1_FC302VTAAXX:2:1:156:340 +40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 \ No newline at end of file