Mercurial > repos > brinkmanlab > make_unique_id

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/make_unique_id.py	Fri Jan 24 17:38:28 2020 -0500
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+import sys
+from Bio import SeqIO
+from collections import defaultdict
+
+# Help text printed to stderr when too few command-line arguments are given.
+usage = """
+make_unique_id
+Makes all record ids unique across all input data.
+All input data must be the same format.
+
+Use: make_unique_id.py [-v] <format> <input1> <output1> [<input2> <output2> ... <inputn> <outputn>]
+\t-v Print version and exit
+
+Valid formats: clustal, embl, fasta, fasta-2line, fastq-sanger, fastq, fastq-solexa, fastq-illumina, genbank, gb, imgt,
+nexus, phd, phylip, pir, seqxml, sff, stockholm, tab, qual
+"""
+
+if __name__ == '__main__':
+    if '-v' in sys.argv:
+        print('1.0')
+        exit(0)
+
+    if len(sys.argv) < 4:
+        print("Missing arguments", file=sys.stderr)
+        print(usage, file=sys.stderr)
+        exit(1)
+
+    format = sys.argv[1]
+    ids = defaultdict(int)
+
+    def makeUnique(seq):
+        count = ids[seq.id]
+        ids[seq.id] += 1
+        if count:
+            suffix = "_" + str(count)
+            newid = seq.id
+            seqlenlen = len(str(len(seq)))
+            if len(newid) + len(suffix) + 1 + seqlenlen > 28:
+                # Genbank has a max length for the id and sequence length number, truncate the sequence id if too long
+               newid = newid[:27 - seqlenlen - len(suffix)]
+
+            print(f"{seq.id}\t{newid}{suffix}")
+            seq.id = newid + suffix
+            seq.name += suffix
+
+        return seq
+
+
+    paths = iter(sys.argv[2:])
+
+    for input, output in zip(paths, paths):
+        SeqIO.write(
+            map(makeUnique, SeqIO.parse(input, format)),
+            output,
+            format
+        )
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/make_unique_id.xml	Fri Jan 24 17:38:28 2020 -0500
@@ -0,0 +1,53 @@
+<tool id="make-unique-id" name="BioPython Make Unique ID" version="1.0" profile="16.04">
+    <description>Makes all record ids unique across all input data</description>
+    <edam_topics>
+        <edam_topic>topic_3345</edam_topic>
+        <edam_topic>topic_3489</edam_topic>
+        <edam_topic>topic_0091</edam_topic>
+    </edam_topics>
+    <edam_operations>
+        <edam_operation>operation_3282</edam_operation>
+    </edam_operations>
+    <requirements>
+        <requirement type="package" version="3.7">python</requirement>
+        <requirement type="package" version="1.73">biopython</requirement>
+    </requirements>
+    <version_command><![CDATA[ python $__tool_directory__/make_unique_id.py -v ]]></version_command>
+    <command detect_errors="aggressive"><![CDATA[
+        python $__tool_directory__/make_unique_id.py ${inputs[0].ext}
+        #for $input, $output in $zip($inputs, $outputs)
+            $input $output
+        #end for
+    ]]></command>
+    <inputs>
+        <param name="inputs" type="data_collection" format="clustal,embl,fasta,fasta-2line,fastq-sanger,fastq,fastq-solexa,fastq-illumina,genbank,gb,imgt,nexus,phd,phylip,pir,seqxml,sff,stockholm,tab,qual" label="Input" />
+    </inputs>
+    <outputs>
+        <collection name="outputs" type="list" structured_like="inputs" inherit_format="true" />
+    </outputs>
+    <tests>
+        <!-- Runs the tool on a three-element list collection built from the same FASTQ file. -->
+        <!-- NOTE(review): the checksum values below are "TODO" placeholders and still need real digests. -->
+        <!-- NOTE(review): this changeset adds test-data/two_records.fastq, not the 1.fastq referenced here; confirm the intended fixture. -->
+        <test expect_num_outputs="1">
+            <param name="inputs" >
+                <collection type="list">
+                    <element name="test1" value="test-data/1.fastq" ftype="fastq" />
+                    <element name="test2" value="test-data/1.fastq" ftype="fastq" />
+                    <element name="test3" value="test-data/1.fastq" ftype="fastq" />
+                </collection>
+            </param>
+            <output_collection name="outputs" type="list" count="3">
+                <element name="test1" ftype="fastq" checksum="TODO" />
+                <element name="test2" ftype="fastq" checksum="TODO" />
+                <element name="test3" ftype="fastq" checksum="TODO" />
+            </output_collection>
+        </test>
+    </tests>
+    <help><![CDATA[
+    Ensure record IDs are unique across datasets.
+    Can read/write any format supported by BioPython SeqIO.
+
+    This is useful when aggregating data downstream and you want to ensure that there are no ID collisions.
+    ]]></help>
+    <citations>
+        <citation type="doi">10.5281/zenodo.3364789</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/two_records.fastq	Fri Jan 24 17:38:28 2020 -0500
@@ -0,0 +1,8 @@
+@HANNIBAL_1_FC302VTAAXX:2:1:228:167
+GAATTGATCAGGACATAGGACAACTGTAGGCACCAT
++HANNIBAL_1_FC302VTAAXX:2:1:228:167
+40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4
+@HANNIBAL_1_FC302VTAAXX:2:1:156:340
+GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG
++HANNIBAL_1_FC302VTAAXX:2:1:156:340
+40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9
\ No newline at end of file