comparison blast_to_scaffold.py @ 0:7d96b28eec49 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/blast_to_scaffold commit 48a4098045106f363e92357949b32617a2e868c1
author artbio
date Sun, 15 Oct 2017 12:52:40 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:7d96b28eec49
1 #!/usr/bin/env python
2 import argparse
3
4
5 def insert_newlines(string, every=60):
6 lines = []
7 for i in range(0, len(string), every):
8 lines.append(string[i:i+every])
9 return '\n'.join(lines)
10
11
12 def getseq(fastadict, transcript, up, down, orientation="direct"):
13 def reverse(seq):
14 revdict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
15 revseq = [revdict[i] for i in seq[::-1]]
16 return "".join(revseq)
17 pickseq = fastadict[transcript][up-1:down]
18 if orientation == "direct":
19 return pickseq
20 else:
21 return reverse(pickseq)
22
23
24 def Parser():
25 the_parser = argparse.ArgumentParser(
26 description="Generate DNA scaffold from blastn or tblastx alignment\
27 of Contigs")
28 the_parser.add_argument('--sequences', action="store", type=str,
29 help="input sequence file in fasta format")
30 the_parser.add_argument('--guideSequence', action="store", type=str,
31 help="the reference sequence to guide the scaffold\
32 assembly in fasta format")
33 the_parser.add_argument('--blast-tab', dest="blast_tab", action="store",
34 type=str,
35 help="13-columns tabular blastn or tblastx output")
36 the_parser.add_argument('--output', action="store", type=str,
37 help="output file path, fasta format")
38 the_parser.add_argument('--scaffold_prefix', action="store", type=str,
39 help="the prefix that will be used for the header\
40 of the fasta scaffold")
41 the_parser.add_argument('--scaffold_suffix', action="store", type=str,
42 help="the sufix that will be used for the header\
43 of the fasta scaffold")
44 args = the_parser.parse_args()
45 return args
46
47
48 def blatnInfo(file):
49 blastlist = []
50 with open(file, "r") as f:
51 for line in f:
52 minilist = []
53 fields = line.rstrip().split()
54 minilist.append(fields[0])
55 minilist.extend(fields[6:10])
56 blastlist.append(minilist)
57 blastlist.sort(key=lambda x: x[3], reverse=True)
58 return blastlist
59
60
61 def myContigs(file):
62 Contigs = {}
63 with open(file, "r") as f:
64 for line in f:
65 if line[0] == ">":
66 header = line[1:-1]
67 Contigs[header] = ""
68 else:
69 Contigs[header] += line[:-1]
70 return Contigs
71
72
73 def myGuide(file):
74 Guide = {}
75 coordinate = 0
76 with open(file, "r") as f:
77 for line in f:
78 if line[0] == ">":
79 continue
80 else:
81 for nucleotide in line[:-1]:
82 coordinate += 1
83 Guide[coordinate] = nucleotide.lower()
84 return Guide
85
86
87 def updateGuide(blastlist, GuideDict, ContigsDict):
88 '''
89 the blastlist object is a list of list with
90 element [0] : name of the blasted Contig
91 element [1] : queryStart of the alignment to the reference
92 element [2] = queryStop of the alignment to the reference
93 element [3] : subjectStart of the alignment to the reference
94 element [4] = subjectStop of the alignment to the reference
95 '''
96 for fields in blastlist:
97 seqHeader = fields[0]
98 queryStart = int(fields[1])
99 queryStop = int(fields[2])
100 subjectStart = int(fields[3])
101 subjectStop = int(fields[4])
102 if subjectStart > subjectStop:
103 subjectStart, subjectStop = subjectStop, subjectStart
104 orientation = "reverse"
105 else:
106 orientation = "direct"
107 sequence = getseq(ContigsDict, seqHeader, queryStart, queryStop,
108 orientation)
109 for i in range(subjectStart, subjectStop+1):
110 try:
111 del GuideDict[i]
112 except KeyError:
113 continue
114 for i, nucleotide in enumerate(sequence):
115 GuideDict[i+subjectStart] = nucleotide
116
117
118 def finalAssembly(GuideDict, outputfile, prefix, suffix):
119 finalSeqList = []
120 for keys in sorted(GuideDict):
121 finalSeqList.append(GuideDict[keys])
122 finalSequence = insert_newlines("".join(finalSeqList))
123 Out = open(outputfile, "w")
124 Out.write(">Scaffold_from_%s_guided_by_%s\n" % (prefix, suffix))
125 Out.write("%s\n" % finalSequence)
126 Out.close()
127
128
129 def __main__():
130 args = Parser()
131 ContigsDict = myContigs(args.sequences)
132 GuideDict = myGuide(args.guideSequence)
133 blastlist = blatnInfo(args.blast_tab)
134 updateGuide(blastlist, GuideDict, ContigsDict)
135 finalAssembly(GuideDict, args.output, args.scaffold_prefix,
136 args.scaffold_suffix)
137
138
139 if __name__ == "__main__":
140 __main__()