comparison stitch.py @ 0:c14690ec0c9f draft

"planemo upload for repository https://github.com/brinkmanlab/galaxy-tools/tree/master/mauve_contig_mover commit 33b02e08cbc8f76fb4b8537f8c968393f85a1b5e"
author brinkmanlab
date Fri, 24 Jan 2020 17:43:19 -0500
parents
children 71831ead9e16
comparison
equal deleted inserted replaced
-1:000000000000 0:c14690ec0c9f
1 #!/usr/bin/env python
2 import sys
3 import csv
4 import getopt
5
6 from Bio import SeqIO, Alphabet
7 from Bio.Seq import Seq
8
9 usage = """
10 Mauve Contig Mover - Stitch
11 Stitch contigs into a single contig.
12 Compliments reversed sequences and rewrites all feature coordinates.
13
14 Use: stitch.py [-v] [-s 'final sequence id'] <padding length> <draft file path> <draft file format> [MauveCM contigs.tab path]
15 \t-v Print version and exit
16 \t-s Provide an ID for the final sequence, the first sequence ID will be used otherwise
17 Valid draft file formats:
18 abi, abi-trim, ace, cif-atom, cif-seqres, clustal, embl, fasta, fasta-2line, fastq-sanger, fastq, fastq-solexa, fastq-illumina,
19 genbank, gb, ig, imgt, nexus, pdb-seqres, pdb-atom, phd, phylip, pir, seqxml, sff, sff-trim, stockholm, swiss, tab, qual, uniprot-xml, gff3
20 """
21
22
23 def getOrder(path):
24 """
25 Parse MCM contig order file and iterate rows after "Ordered Contigs"
26 :param path: path to MCM *_contig.tab
27 :return: tuple(type, label, contig_type, strand, left_end, right_end)
28 """
29 with open(path, "r") as alignment_file:
30 alignments = iter(csv.reader(alignment_file, delimiter="\t"))
31 try:
32 alignment = next(alignments)
33
34 # Jog to beginning of ordered alignments
35 while not (len(alignment) and "Ordered Contigs" == alignment[0]):
36 alignment = next(alignments)
37
38 # Skip column header
39 next(alignments)
40
41 while True:
42 yield next(alignments)
43 except StopIteration:
44 return
45
46
47 def stitch(pad, contigs, order):
48 """
49 Reduce contigs to single contig by concatenation.
50 Compliments reversed sequences and rewrites all feature coordinates.
51 :param pad: Seq or SeqRecord instance to insert between contigs
52 :param contigs: dict of SeqRecords keyed on the record name
53 :param order: iterable of tuples containing sequence names and orientation
54 :return: concatentated SeqRecord
55 """
56 result = None
57 # Concat in order with padding
58 for alignment in order:
59 if len(alignment) < 4: continue
60 try:
61 contig = contigs.pop(alignment[1]) # type: SeqIO.SeqRecord
62 if alignment[3] == "complement":
63 contig = contig.reverse_complement()
64 if result:
65 # A lot is happening in the background here. Biopython handles the feature coordinates implicitly.
66 result += pad + contig
67 else:
68 result = contig
69 pad.alphabet = result.seq.alphabet
70 except KeyError:
71 pass
72
73 # Concat remaining in arbitrary order
74 for unordered in contigs.values():
75 if result:
76 result += pad + unordered
77 else:
78 result = unordered
79
80 return result
81
82
83 if __name__ == '__main__':
84 seqid = None
85 # Parse arguments
86 try:
87 opts, args = getopt.gnu_getopt(sys.argv[1:], 'vs:iq:')
88 for opt, val in opts:
89 if opt == '-v':
90 print('1.0')
91 exit(0)
92 elif opt == '-s':
93 seqid = val
94 except getopt.GetoptError as err:
95 print("Argument error(" + str(err.opt) + "): " + err.msg, file=sys.stderr)
96 args = []
97
98 # Check for minimum number of arguments
99 if len(args) < 3:
100 print(usage, file=sys.stderr)
101 exit(1)
102
103 pad_len = int(args[0])
104 if pad_len < 0:
105 print("Padding length must be >= 0", file=sys.stderr)
106 print(help, file=sys.stderr)
107 exit(1)
108
109 draft_path = args[1]
110 draft_format = args[2]
111
112 if len(args) < 4:
113 order = ()
114 else:
115 order = getOrder(args[3])
116
117 pad = Seq('N'*pad_len)
118 contigs = {seq.name: seq for seq in SeqIO.parse(draft_path, draft_format)}
119
120 result = stitch(pad, contigs, order)
121
122 if result:
123 # Ensure there is only one 'source' feature
124 # TODO
125 pass
126
127 if result and seqid:
128 result.id = seqid
129 result.description = ""
130
131 seqlenlen = len(str(len(result)))
132 if len(result.id) + 1 + seqlenlen > 28:
133 # Genbank has a max length for the id and sequence length number, truncate the sequence id ov too long
134 result.id = result.id[:27 - seqlenlen]
135
136 result.seq.alphabet = Alphabet.generic_dna # TODO Investigate why this is required for some datasets
137 SeqIO.write(result, sys.stdout, draft_format)