Mercurial > repos > brinkmanlab > mauve_contig_mover
diff stitch.py @ 0:c14690ec0c9f draft
"planemo upload for repository https://github.com/brinkmanlab/galaxy-tools/tree/master/mauve_contig_mover commit 33b02e08cbc8f76fb4b8537f8c968393f85a1b5e"
author | brinkmanlab |
---|---|
date | Fri, 24 Jan 2020 17:43:19 -0500 |
parents | |
children | 71831ead9e16 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/stitch.py Fri Jan 24 17:43:19 2020 -0500 @@ -0,0 +1,137 @@ +#!/usr/bin/env python +import sys +import csv +import getopt + +from Bio import SeqIO, Alphabet +from Bio.Seq import Seq + +usage = """ +Mauve Contig Mover - Stitch +Stitch contigs into a single contig. +Compliments reversed sequences and rewrites all feature coordinates. + +Use: stitch.py [-v] [-s 'final sequence id'] <padding length> <draft file path> <draft file format> [MauveCM contigs.tab path] +\t-v Print version and exit +\t-s Provide an ID for the final sequence, the first sequence ID will be used otherwise +Valid draft file formats: +abi, abi-trim, ace, cif-atom, cif-seqres, clustal, embl, fasta, fasta-2line, fastq-sanger, fastq, fastq-solexa, fastq-illumina, +genbank, gb, ig, imgt, nexus, pdb-seqres, pdb-atom, phd, phylip, pir, seqxml, sff, sff-trim, stockholm, swiss, tab, qual, uniprot-xml, gff3 +""" + + +def getOrder(path): + """ + Parse MCM contig order file and iterate rows after "Ordered Contigs" + :param path: path to MCM *_contig.tab + :return: tuple(type, label, contig_type, strand, left_end, right_end) + """ + with open(path, "r") as alignment_file: + alignments = iter(csv.reader(alignment_file, delimiter="\t")) + try: + alignment = next(alignments) + + # Jog to beginning of ordered alignments + while not (len(alignment) and "Ordered Contigs" == alignment[0]): + alignment = next(alignments) + + # Skip column header + next(alignments) + + while True: + yield next(alignments) + except StopIteration: + return + + +def stitch(pad, contigs, order): + """ + Reduce contigs to single contig by concatenation. + Compliments reversed sequences and rewrites all feature coordinates. + :param pad: Seq or SeqRecord instance to insert between contigs + :param contigs: dict of SeqRecords keyed on the record name + :param order: iterable of tuples containing sequence names and orientation + :return: concatentated SeqRecord + """ + result = None + # Concat in order with padding + for alignment in order: + if len(alignment) < 4: continue + try: + contig = contigs.pop(alignment[1]) # type: SeqIO.SeqRecord + if alignment[3] == "complement": + contig = contig.reverse_complement() + if result: + # A lot is happening in the background here. Biopython handles the feature coordinates implicitly. + result += pad + contig + else: + result = contig + pad.alphabet = result.seq.alphabet + except KeyError: + pass + + # Concat remaining in arbitrary order + for unordered in contigs.values(): + if result: + result += pad + unordered + else: + result = unordered + + return result + + +if __name__ == '__main__': + seqid = None + # Parse arguments + try: + opts, args = getopt.gnu_getopt(sys.argv[1:], 'vs:iq:') + for opt, val in opts: + if opt == '-v': + print('1.0') + exit(0) + elif opt == '-s': + seqid = val + except getopt.GetoptError as err: + print("Argument error(" + str(err.opt) + "): " + err.msg, file=sys.stderr) + args = [] + + # Check for minimum number of arguments + if len(args) < 3: + print(usage, file=sys.stderr) + exit(1) + + pad_len = int(args[0]) + if pad_len < 0: + print("Padding length must be >= 0", file=sys.stderr) + print(help, file=sys.stderr) + exit(1) + + draft_path = args[1] + draft_format = args[2] + + if len(args) < 4: + order = () + else: + order = getOrder(args[3]) + + pad = Seq('N'*pad_len) + contigs = {seq.name: seq for seq in SeqIO.parse(draft_path, draft_format)} + + result = stitch(pad, contigs, order) + + if result: + # Ensure there is only one 'source' feature + # TODO + pass + + if result and seqid: + result.id = seqid + result.description = "" + + seqlenlen = len(str(len(result))) + if len(result.id) + 1 + seqlenlen > 28: + # Genbank has a max length for the id and sequence length number, truncate the sequence id ov too long + result.id = result.id[:27 - seqlenlen] + + result.seq.alphabet = Alphabet.generic_dna # TODO Investigate why this is required for some datasets + SeqIO.write(result, sys.stdout, draft_format)