Mercurial > repos > brinkmanlab > mauve_contig_mover
comparison stitch.py @ 0:c14690ec0c9f draft
"planemo upload for repository https://github.com/brinkmanlab/galaxy-tools/tree/master/mauve_contig_mover commit 33b02e08cbc8f76fb4b8537f8c968393f85a1b5e"
author | brinkmanlab |
---|---|
date | Fri, 24 Jan 2020 17:43:19 -0500 |
parents | |
children | 71831ead9e16 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c14690ec0c9f |
---|---|
1 #!/usr/bin/env python | |
2 import sys | |
3 import csv | |
4 import getopt | |
5 | |
6 from Bio import SeqIO, Alphabet | |
7 from Bio.Seq import Seq | |
8 | |
9 usage = """ | |
10 Mauve Contig Mover - Stitch | |
11 Stitch contigs into a single contig. | |
12 Compliments reversed sequences and rewrites all feature coordinates. | |
13 | |
14 Use: stitch.py [-v] [-s 'final sequence id'] <padding length> <draft file path> <draft file format> [MauveCM contigs.tab path] | |
15 \t-v Print version and exit | |
16 \t-s Provide an ID for the final sequence, the first sequence ID will be used otherwise | |
17 Valid draft file formats: | |
18 abi, abi-trim, ace, cif-atom, cif-seqres, clustal, embl, fasta, fasta-2line, fastq-sanger, fastq, fastq-solexa, fastq-illumina, | |
19 genbank, gb, ig, imgt, nexus, pdb-seqres, pdb-atom, phd, phylip, pir, seqxml, sff, sff-trim, stockholm, swiss, tab, qual, uniprot-xml, gff3 | |
20 """ | |
21 | |
22 | |
23 def getOrder(path): | |
24 """ | |
25 Parse MCM contig order file and iterate rows after "Ordered Contigs" | |
26 :param path: path to MCM *_contig.tab | |
27 :return: tuple(type, label, contig_type, strand, left_end, right_end) | |
28 """ | |
29 with open(path, "r") as alignment_file: | |
30 alignments = iter(csv.reader(alignment_file, delimiter="\t")) | |
31 try: | |
32 alignment = next(alignments) | |
33 | |
34 # Jog to beginning of ordered alignments | |
35 while not (len(alignment) and "Ordered Contigs" == alignment[0]): | |
36 alignment = next(alignments) | |
37 | |
38 # Skip column header | |
39 next(alignments) | |
40 | |
41 while True: | |
42 yield next(alignments) | |
43 except StopIteration: | |
44 return | |
45 | |
46 | |
47 def stitch(pad, contigs, order): | |
48 """ | |
49 Reduce contigs to single contig by concatenation. | |
50 Compliments reversed sequences and rewrites all feature coordinates. | |
51 :param pad: Seq or SeqRecord instance to insert between contigs | |
52 :param contigs: dict of SeqRecords keyed on the record name | |
53 :param order: iterable of tuples containing sequence names and orientation | |
54 :return: concatentated SeqRecord | |
55 """ | |
56 result = None | |
57 # Concat in order with padding | |
58 for alignment in order: | |
59 if len(alignment) < 4: continue | |
60 try: | |
61 contig = contigs.pop(alignment[1]) # type: SeqIO.SeqRecord | |
62 if alignment[3] == "complement": | |
63 contig = contig.reverse_complement() | |
64 if result: | |
65 # A lot is happening in the background here. Biopython handles the feature coordinates implicitly. | |
66 result += pad + contig | |
67 else: | |
68 result = contig | |
69 pad.alphabet = result.seq.alphabet | |
70 except KeyError: | |
71 pass | |
72 | |
73 # Concat remaining in arbitrary order | |
74 for unordered in contigs.values(): | |
75 if result: | |
76 result += pad + unordered | |
77 else: | |
78 result = unordered | |
79 | |
80 return result | |
81 | |
82 | |
83 if __name__ == '__main__': | |
84 seqid = None | |
85 # Parse arguments | |
86 try: | |
87 opts, args = getopt.gnu_getopt(sys.argv[1:], 'vs:iq:') | |
88 for opt, val in opts: | |
89 if opt == '-v': | |
90 print('1.0') | |
91 exit(0) | |
92 elif opt == '-s': | |
93 seqid = val | |
94 except getopt.GetoptError as err: | |
95 print("Argument error(" + str(err.opt) + "): " + err.msg, file=sys.stderr) | |
96 args = [] | |
97 | |
98 # Check for minimum number of arguments | |
99 if len(args) < 3: | |
100 print(usage, file=sys.stderr) | |
101 exit(1) | |
102 | |
103 pad_len = int(args[0]) | |
104 if pad_len < 0: | |
105 print("Padding length must be >= 0", file=sys.stderr) | |
106 print(help, file=sys.stderr) | |
107 exit(1) | |
108 | |
109 draft_path = args[1] | |
110 draft_format = args[2] | |
111 | |
112 if len(args) < 4: | |
113 order = () | |
114 else: | |
115 order = getOrder(args[3]) | |
116 | |
117 pad = Seq('N'*pad_len) | |
118 contigs = {seq.name: seq for seq in SeqIO.parse(draft_path, draft_format)} | |
119 | |
120 result = stitch(pad, contigs, order) | |
121 | |
122 if result: | |
123 # Ensure there is only one 'source' feature | |
124 # TODO | |
125 pass | |
126 | |
127 if result and seqid: | |
128 result.id = seqid | |
129 result.description = "" | |
130 | |
131 seqlenlen = len(str(len(result))) | |
132 if len(result.id) + 1 + seqlenlen > 28: | |
133 # Genbank has a max length for the id and sequence length number, truncate the sequence id ov too long | |
134 result.id = result.id[:27 - seqlenlen] | |
135 | |
136 result.seq.alphabet = Alphabet.generic_dna # TODO Investigate why this is required for some datasets | |
137 SeqIO.write(result, sys.stdout, draft_format) |