diff stitch.py @ 0:c14690ec0c9f draft

"planemo upload for repository https://github.com/brinkmanlab/galaxy-tools/tree/master/mauve_contig_mover commit 33b02e08cbc8f76fb4b8537f8c968393f85a1b5e"
author brinkmanlab
date Fri, 24 Jan 2020 17:43:19 -0500
parents
children 71831ead9e16
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/stitch.py	Fri Jan 24 17:43:19 2020 -0500
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+import sys
+import csv
+import getopt
+
+from Bio import SeqIO, Alphabet
+from Bio.Seq import Seq
+
+usage = """
+Mauve Contig Mover - Stitch
+Stitch contigs into a single contig.
+Compliments reversed sequences and rewrites all feature coordinates.
+
+Use: stitch.py [-v] [-s 'final sequence id'] <padding length> <draft file path> <draft file format> [MauveCM contigs.tab path]
+\t-v Print version and exit
+\t-s Provide an ID for the final sequence, the first sequence ID will be used otherwise
+Valid draft file formats:
+abi, abi-trim, ace, cif-atom, cif-seqres, clustal, embl, fasta, fasta-2line, fastq-sanger, fastq, fastq-solexa, fastq-illumina,
+genbank, gb, ig, imgt, nexus, pdb-seqres, pdb-atom, phd, phylip, pir, seqxml, sff, sff-trim, stockholm, swiss, tab, qual, uniprot-xml, gff3
+"""
+
+
+def getOrder(path):
+    """
+    Parse MCM contig order file and iterate rows after "Ordered Contigs"
+    :param path: path to MCM *_contig.tab
+    :return: tuple(type, label, contig_type, strand, left_end, right_end)
+    """
+    with open(path, "r") as alignment_file:
+        alignments = iter(csv.reader(alignment_file, delimiter="\t"))
+        try:
+            alignment = next(alignments)
+
+            # Jog to beginning of ordered alignments
+            while not (len(alignment) and "Ordered Contigs" == alignment[0]):
+                alignment = next(alignments)
+
+            # Skip column header
+            next(alignments)
+
+            while True:
+                yield next(alignments)
+        except StopIteration:
+            return
+
+
+def stitch(pad, contigs, order):
+    """
+    Reduce contigs to single contig by concatenation.
+    Compliments reversed sequences and rewrites all feature coordinates.
+    :param pad: Seq or SeqRecord instance to insert between contigs 
+    :param contigs: dict of SeqRecords keyed on the record name
+    :param order: iterable of tuples containing sequence names and orientation 
+    :return: concatentated SeqRecord
+    """
+    result = None
+    # Concat in order with padding
+    for alignment in order:
+        if len(alignment) < 4: continue
+        try:
+            contig = contigs.pop(alignment[1])  # type: SeqIO.SeqRecord
+            if alignment[3] == "complement":
+                contig = contig.reverse_complement()
+            if result:
+                # A lot is happening in the background here. Biopython handles the feature coordinates implicitly.
+                result += pad + contig
+            else:
+                result = contig
+                pad.alphabet = result.seq.alphabet
+        except KeyError:
+            pass
+
+    # Concat remaining in arbitrary order
+    for unordered in contigs.values():
+        if result:
+            result += pad + unordered
+        else:
+            result = unordered
+
+    return result
+
+
+if __name__ == '__main__':
+    seqid = None
+    # Parse arguments
+    try:
+        opts, args = getopt.gnu_getopt(sys.argv[1:], 'vs:iq:')
+        for opt, val in opts:
+            if opt == '-v':
+                print('1.0')
+                exit(0)
+            elif opt == '-s':
+                seqid = val
+    except getopt.GetoptError as err:
+        print("Argument error(" + str(err.opt) + "): " + err.msg, file=sys.stderr)
+        args = []
+    
+    # Check for minimum number of arguments
+    if len(args) < 3:
+        print(usage, file=sys.stderr)
+        exit(1)    
+
+    pad_len = int(args[0])
+    if pad_len < 0:
+        print("Padding length must be >= 0", file=sys.stderr)
+        print(help, file=sys.stderr)
+        exit(1)
+
+    draft_path = args[1]
+    draft_format = args[2]
+    
+    if len(args) < 4:
+        order = ()
+    else:
+        order = getOrder(args[3])
+        
+    pad = Seq('N'*pad_len)
+    contigs = {seq.name: seq for seq in SeqIO.parse(draft_path, draft_format)}
+
+    result = stitch(pad, contigs, order)
+
+    if result:
+        # Ensure there is only one 'source' feature
+        # TODO
+        pass
+
+    if result and seqid:
+        result.id = seqid
+        result.description = ""
+
+    seqlenlen = len(str(len(result)))
+    if len(result.id) + 1 + seqlenlen > 28:
+        # Genbank has a max length for the id and sequence length number, truncate the sequence id ov too long
+        result.id = result.id[:27 - seqlenlen]
+
+    result.seq.alphabet = Alphabet.generic_dna # TODO Investigate why this is required for some datasets
+    SeqIO.write(result, sys.stdout, draft_format)