comparison BlastParser_and_hits.py @ 0:9dfb65ebb02e draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/blastparser_and_hits commit 48132e5edac97d54804ccbaf620068a5fb800bdc
author artbio
date Sun, 15 Oct 2017 18:43:37 -0400
parents
children 36103afa0934
comparison
equal deleted inserted replaced
-1:000000000000 0:9dfb65ebb02e
1 #!/usr/bin/python
2 import argparse
3 from collections import defaultdict
4
5
6 def Parser():
7 the_parser = argparse.ArgumentParser()
8 the_parser.add_argument('--blast', action="store", type=str,
9 help="Path to the blast output\
10 (tabular format, 12 column)")
11 the_parser.add_argument('--sequences', action="store", type=str,
12 help="Path to the fasta file with blasted\
13 sequences")
14 the_parser.add_argument('--fastaOutput', action="store", type=str,
15 help="fasta output file of blast hits")
16 the_parser.add_argument('--tabularOutput', action="store", type=str,
17 help="tabular output file of blast analysis")
18 the_parser.add_argument('--flanking', action="store", type=int,
19 help="number of flanking nucleotides\
20 added to the hit sequences")
21 the_parser.add_argument('--mode', action="store",
22 choices=["verbose", "short"], type=str,
23 help="reporting (verbose) or not reporting (short)\
24 oases contigs")
25 the_parser.add_argument('--filter_relativeCov', action="store", type=float,
26 default=0,
27 help="filter out relative coverages\
28 below the specified ratio (float number)")
29 the_parser.add_argument('--filter_maxScore', action="store", type=float,
30 default=0, help="filter out best BitScores below\
31 the specified float number")
32 the_parser.add_argument('--filter_meanScore', action="store", type=float,
33 default=0,
34 help="filter out mean BitScores below the\
35 specified float number")
36 the_parser.add_argument('--filter_term_in', action="store", type=str,
37 default="",
38 help="select the specified term in the\
39 subject list")
40 the_parser.add_argument('--filter_term_out', action="store", type=str,
41 default="",
42 help="exclude the specified term from\
43 the subject list")
44 the_parser.add_argument('--al_sequences', action="store", type=str,
45 help="sequences that have been blast aligned")
46 the_parser.add_argument('--un_sequences', action="store", type=str,
47 help="sequences that have not been blast aligned")
48 the_parser.add_argument('--dataset_name', action="store", type=str,
49 default="",
50 help="the name of the dataset that has been parsed,\
51 to be reported in the output")
52 args = the_parser.parse_args()
53 if not all((args.sequences, args.blast, args.fastaOutput,
54 args.tabularOutput)):
55 the_parser.error('argument(s) missing, call the\
56 -h option of the script')
57 if not args.flanking:
58 args.flanking = 0
59 return args
60
61
62 def median(lst):
63 lst = sorted(lst)
64 if len(lst) < 1:
65 return None
66 if len(lst) % 2 == 1:
67 return lst[((len(lst)+1)/2)-1]
68 if len(lst) % 2 == 0:
69 return float(sum(lst[(len(lst)/2)-1:(len(lst)/2)+1]))/2.0
70
71
72 def mean(lst):
73 if len(lst) < 1:
74 return 0
75 return sum(lst) / float(len(lst))
76
77
78 def getfasta(fastafile):
79 fastadic = {}
80 for line in open(fastafile):
81 if line[0] == ">":
82 header = line[1:-1]
83 fastadic[header] = ""
84 else:
85 fastadic[header] += line
86 for header in fastadic:
87 fastadic[header] = "".join(fastadic[header].split("\n"))
88 return fastadic
89
90
91 def insert_newlines(string, every=60):
92 lines = []
93 for i in range(0, len(string), every):
94 lines.append(string[i:i+every])
95 return '\n'.join(lines)
96
97
98 def getblast(blastfile):
99 '''blastinfo [0] Percentage of identical matches
100 blastinfo [1] Alignment length
101 blastinfo [2] Number of mismatches
102 blastinfo [3] Number of gap openings
103 blastinfo [4] Start of alignment in query
104 blastinfo [5] End of alignment in query
105 blastinfo [6] Start of alignment in subject (database hit)
106 blastinfo [7] End of alignment in subject (database hit)
107 blastinfo [8] Expectation value (E-value)
108 blastinfo [9] Bit score
109 blastinfo [10] Subject length
110 (NEED TO BE SPECIFIED WHEN RUNNING BLAST) '''
111 blastdic = defaultdict(dict)
112 for line in open(blastfile):
113 fields = line[:-1].split("\t")
114 transcript = fields[0]
115 subject = fields[1]
116 # blastinfo[0]
117 blastinfo = [float(fields[2])]
118 # blastinfo[1:8] insets 1 to 7
119 blastinfo = blastinfo + [int(i) for i in fields[3:10]]
120 # blastinfo[8] E-value remains as a string type
121 blastinfo.append(fields[10])
122 # blastinfo[9] Bit score
123 blastinfo.append(float(fields[11]))
124 # blastinfo[10] Subject length MUST BE RETRIEVED
125 # THROUGH A 13 COLUMN BLAST OUTPUT
126 blastinfo.append(int(fields[12]))
127 try:
128 blastdic[subject][transcript].append(blastinfo)
129 except Exception:
130 blastdic[subject][transcript] = [blastinfo]
131 return blastdic
132
133
134 def getseq(fastadict, transcript, up, down, orientation="direct"):
135 def reverse(seq):
136 revdict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
137 revseq = [revdict[i] for i in seq[::-1]]
138 return "".join(revseq)
139 pickseq = fastadict[transcript][up-1:down]
140 if orientation == "direct":
141 return pickseq
142 else:
143 return reverse(pickseq)
144
145
146 def subjectCoverage(fastadict, blastdict, subject,
147 QueriesFlankingNucleotides=0):
148 SubjectCoverageList = []
149 HitDic = {}
150 bitScores = []
151 for transcript in blastdict[subject]:
152 prefix = "%s--%s_" % (subject, transcript)
153 hitNumber = 0
154 for hit in blastdict[subject][transcript]:
155 hitNumber += 1
156 suffix = "hit%s_IdMatch=%s,AligLength=%s,E-val=%s" % (hitNumber,
157 hit[0],
158 hit[1],
159 hit[8])
160 # query coverage by a hit is in hit[4:6]
161 HitDic[prefix+suffix] = GetHitSequence(fastadict, transcript,
162 hit[4], hit[5],
163 QueriesFlankingNucleotides)
164 # subject coverage by a hit is in hit[6:8]
165 SubjectCoverageList += range(min([hit[6], hit[7]]),
166 max([hit[6], hit[7]]) + 1)
167 bitScores.append(hit[9])
168 # always the same value for a given subject. Stupid but simple
169 subjectLength = hit[10]
170 TotalSubjectCoverage = len(set(SubjectCoverageList))
171 RelativeSubjectCoverage = TotalSubjectCoverage/float(subjectLength)
172 return (HitDic, subjectLength, TotalSubjectCoverage,
173 RelativeSubjectCoverage, max(bitScores), mean(bitScores))
174
175
176 def GetHitSequence(fastadict, FastaHeader, leftCoordinate, rightCoordinate,
177 FlankingValue):
178 if rightCoordinate > leftCoordinate:
179 polarity = "direct"
180 else:
181 polarity = "reverse"
182 leftCoordinate, rightCoordinate = rightCoordinate, leftCoordinate
183 if leftCoordinate - FlankingValue > 0:
184 leftCoordinate -= FlankingValue
185 else:
186 leftCoordinate = 1
187 return getseq(fastadict, FastaHeader, leftCoordinate, rightCoordinate,
188 polarity)
189
190
191 def outputParsing(dataset_name, F, Fasta, results, Xblastdict, fastadict,
192 filter_relativeCov=0, filter_maxScore=0, filter_meanScore=0,
193 filter_term_in="", filter_term_out="", mode="verbose"):
194 def filter_results(results, filter_relativeCov=0, filter_maxScore=0,
195 filter_meanScore=0, filter_term_in="",
196 filter_term_out=""):
197 for subject in results.keys():
198 if results[subject][
199 "RelativeSubjectCoverage"] < filter_relativeCov:
200 del results[subject]
201 continue
202 if results[subject]["maxBitScores"] < filter_maxScore:
203 del results[subject]
204 continue
205 if results[subject]["meanBitScores"] < filter_meanScore:
206 del results[subject]
207 continue
208 if filter_term_in in subject:
209 pass
210 else:
211 del results[subject]
212 continue
213 if filter_term_out and filter_term_out in subject:
214 del results[subject]
215 continue
216 return results
217
218 F = open(F, "w")
219 Fasta = open(Fasta, "w")
220 blasted_transcripts = []
221 filter_results(results, filter_relativeCov, filter_maxScore,
222 filter_meanScore, filter_term_in, filter_term_out)
223 for subject in results:
224 for transcript in Xblastdict[subject]:
225 blasted_transcripts.append(transcript)
226 blasted_transcripts = list(set(blasted_transcripts))
227 if mode == "verbose":
228 F.write("--- %s ---\n" % dataset_name)
229 F.write("# %s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ("SeqId", "%Identity",
230 "AlignLength",
231 "StartSubject",
232 "EndSubject",
233 "%QueryHitCov",
234 "E-value",
235 "BitScore"))
236 for subject in sorted(results,
237 key=lambda x: results[x]["meanBitScores"],
238 reverse=True):
239 F.write(" \n# %s\n" % subject)
240 F.write("# Suject Length: %s\n" %
241 results[subject]["subjectLength"])
242 F.write("# Total Subject Coverage: %s\n" %
243 results[subject]["TotalCoverage"])
244 F.write("# Relative Subject Coverage: %s\n" %
245 results[subject]["RelativeSubjectCoverage"])
246 F.write("# Best Bit Score: %s\n" % results[subject][
247 "maxBitScores"])
248 F.write("# Mean Bit Score: %s\n" % results[subject][
249 "meanBitScores"])
250 for header in results[subject]["HitDic"]:
251 Fasta.write(">%s\n%s\n" % (header,
252 insert_newlines(results[subject][
253 "HitDic"][
254 header])))
255 Fasta.write("\n") # final carriage return for the sequence
256 for transcript in Xblastdict[subject]:
257 transcriptSize = float(len(fastadict[transcript]))
258 for hit in Xblastdict[subject][transcript]:
259 percentIdentity = hit[0]
260 alignLenght = hit[1]
261 subjectStart = hit[6]
262 subjectEnd = hit[7]
263 queryCov = "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100)
264 Eval, BitScore = hit[8], hit[9]
265 info = [transcript] + [percentIdentity, alignLenght,
266 subjectStart, subjectEnd, queryCov,
267 Eval, BitScore]
268 info = [str(i) for i in info]
269 info = "\t".join(info)
270 F.write("%s\n" % info)
271 else:
272 F.write("--- %s ---\n" % dataset_name)
273 F.write("# subject\tsubject length\tTotal Subject Coverage\tRelative\
274 Subject Coverage\tBest Bit Score\tMean Bit Score\n")
275 for subject in sorted(results,
276 key=lambda x: results[x]["meanBitScores"],
277 reverse=True):
278 line = []
279 line.append(subject)
280 line.append(results[subject]["subjectLength"])
281 line.append(results[subject]["TotalCoverage"])
282 line.append(results[subject]["RelativeSubjectCoverage"])
283 line.append(results[subject]["maxBitScores"])
284 line.append(results[subject]["meanBitScores"])
285 line = [str(i) for i in line]
286 F.write("%s\n" % "\t".join(line))
287 for header in results[subject]["HitDic"]:
288 Fasta.write(">%s\n%s\n" % (header,
289 insert_newlines(
290 results[subject][
291 "HitDic"][header])))
292 Fasta.write("\n") # final carriage return for the sequence
293 F.close()
294 Fasta.close()
295 return blasted_transcripts
296
297
298 def dispatch_sequences(fastadict, blasted_transcripts, matched_sequences,
299 unmatched_sequences):
300 '''to output the sequences that matched and did not matched in the blast'''
301 F_matched = open(matched_sequences, "w")
302 F_unmatched = open(unmatched_sequences, "w")
303 for transcript in fastadict:
304 if transcript in blasted_transcripts:
305 ''''list of blasted_transcripts is generated
306 by the outputParsing function'''
307 F_matched.write(">%s\n%s\n" % (transcript, insert_newlines(
308 fastadict[transcript])))
309 else:
310 F_unmatched.write(">%s\n%s\n" % (transcript, insert_newlines(
311 fastadict[transcript])))
312 F_matched.close()
313 F_unmatched.close()
314 return
315
316
317 def __main__():
318 args = Parser()
319 fastadict = getfasta(args.sequences)
320 Xblastdict = getblast(args.blast)
321 results = defaultdict(dict)
322 for subject in Xblastdict:
323 results[subject]["HitDic"], results[subject]["subjectLength"], results[
324 subject]["TotalCoverage"], results[subject][
325 "RelativeSubjectCoverage"], results[subject][
326 "maxBitScores"], results[subject][
327 "meanBitScores"] = subjectCoverage(fastadict, Xblastdict, subject,
328 args.flanking)
329 blasted_transcripts = outputParsing(
330 args.dataset_name, args.tabularOutput,
331 args.fastaOutput, results, Xblastdict, fastadict,
332 filter_relativeCov=args.filter_relativeCov,
333 filter_maxScore=args.filter_maxScore,
334 filter_meanScore=args.filter_meanScore,
335 filter_term_in=args.filter_term_in,
336 filter_term_out=args.filter_term_out, mode=args.mode)
337 dispatch_sequences(fastadict, blasted_transcripts, args.al_sequences,
338 args.un_sequences)
339
340
341 if __name__ == "__main__":
342 __main__()