Mercurial > repos > brinkmanlab > mauve_contig_mover
view stitch.py @ 0:c14690ec0c9f draft
"planemo upload for repository https://github.com/brinkmanlab/galaxy-tools/tree/master/mauve_contig_mover commit 33b02e08cbc8f76fb4b8537f8c968393f85a1b5e"
author | brinkmanlab |
---|---|
date | Fri, 24 Jan 2020 17:43:19 -0500 |
parents | |
children | 71831ead9e16 |
line wrap: on
line source
#!/usr/bin/env python import sys import csv import getopt from Bio import SeqIO, Alphabet from Bio.Seq import Seq usage = """ Mauve Contig Mover - Stitch Stitch contigs into a single contig. Compliments reversed sequences and rewrites all feature coordinates. Use: stitch.py [-v] [-s 'final sequence id'] <padding length> <draft file path> <draft file format> [MauveCM contigs.tab path] \t-v Print version and exit \t-s Provide an ID for the final sequence, the first sequence ID will be used otherwise Valid draft file formats: abi, abi-trim, ace, cif-atom, cif-seqres, clustal, embl, fasta, fasta-2line, fastq-sanger, fastq, fastq-solexa, fastq-illumina, genbank, gb, ig, imgt, nexus, pdb-seqres, pdb-atom, phd, phylip, pir, seqxml, sff, sff-trim, stockholm, swiss, tab, qual, uniprot-xml, gff3 """ def getOrder(path): """ Parse MCM contig order file and iterate rows after "Ordered Contigs" :param path: path to MCM *_contig.tab :return: tuple(type, label, contig_type, strand, left_end, right_end) """ with open(path, "r") as alignment_file: alignments = iter(csv.reader(alignment_file, delimiter="\t")) try: alignment = next(alignments) # Jog to beginning of ordered alignments while not (len(alignment) and "Ordered Contigs" == alignment[0]): alignment = next(alignments) # Skip column header next(alignments) while True: yield next(alignments) except StopIteration: return def stitch(pad, contigs, order): """ Reduce contigs to single contig by concatenation. Compliments reversed sequences and rewrites all feature coordinates. :param pad: Seq or SeqRecord instance to insert between contigs :param contigs: dict of SeqRecords keyed on the record name :param order: iterable of tuples containing sequence names and orientation :return: concatentated SeqRecord """ result = None # Concat in order with padding for alignment in order: if len(alignment) < 4: continue try: contig = contigs.pop(alignment[1]) # type: SeqIO.SeqRecord if alignment[3] == "complement": contig = contig.reverse_complement() if result: # A lot is happening in the background here. Biopython handles the feature coordinates implicitly. result += pad + contig else: result = contig pad.alphabet = result.seq.alphabet except KeyError: pass # Concat remaining in arbitrary order for unordered in contigs.values(): if result: result += pad + unordered else: result = unordered return result if __name__ == '__main__': seqid = None # Parse arguments try: opts, args = getopt.gnu_getopt(sys.argv[1:], 'vs:iq:') for opt, val in opts: if opt == '-v': print('1.0') exit(0) elif opt == '-s': seqid = val except getopt.GetoptError as err: print("Argument error(" + str(err.opt) + "): " + err.msg, file=sys.stderr) args = [] # Check for minimum number of arguments if len(args) < 3: print(usage, file=sys.stderr) exit(1) pad_len = int(args[0]) if pad_len < 0: print("Padding length must be >= 0", file=sys.stderr) print(help, file=sys.stderr) exit(1) draft_path = args[1] draft_format = args[2] if len(args) < 4: order = () else: order = getOrder(args[3]) pad = Seq('N'*pad_len) contigs = {seq.name: seq for seq in SeqIO.parse(draft_path, draft_format)} result = stitch(pad, contigs, order) if result: # Ensure there is only one 'source' feature # TODO pass if result and seqid: result.id = seqid result.description = "" seqlenlen = len(str(len(result))) if len(result.id) + 1 + seqlenlen > 28: # Genbank has a max length for the id and sequence length number, truncate the sequence id ov too long result.id = result.id[:27 - seqlenlen] result.seq.alphabet = Alphabet.generic_dna # TODO Investigate why this is required for some datasets SeqIO.write(result, sys.stdout, draft_format)