# HG changeset patch # User drosofff # Date 1478709152 18000 # Node ID 1991c830504a5f9f0d54b1c0bf07f254fe3ebac2 # Parent 6dfa79a6908a42e417d4a024275245fe3dbd6bae planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit b6de14061c479f0418cd89e26d6f5ac26e565a07 diff -r 6dfa79a6908a -r 1991c830504a BlastParser_and_hits.py --- a/BlastParser_and_hits.py Tue Apr 05 05:19:08 2016 -0400 +++ b/BlastParser_and_hits.py Wed Nov 09 11:32:32 2016 -0500 @@ -2,17 +2,17 @@ # blastn tblastn blastx parser revised 14-1-2016. # drosofff@gmail.com -import sys import argparse from collections import defaultdict + def Parser(): the_parser = argparse.ArgumentParser() the_parser.add_argument('--blast', action="store", type=str, help="Path to the blast output (tabular format, 12 column)") the_parser.add_argument('--sequences', action="store", type=str, help="Path to the fasta file with blasted sequences") the_parser.add_argument('--fastaOutput', action="store", type=str, help="fasta output file of blast hits") the_parser.add_argument('--tabularOutput', action="store", type=str, help="tabular output file of blast analysis") - the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences") + the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences") the_parser.add_argument('--mode', action="store", choices=["verbose", "short"], type=str, help="reporting (verbose) or not reporting (short) oases contigs") the_parser.add_argument('--filter_relativeCov', action="store", type=float, default=0, help="filter out relative coverages below the specified ratio (float number)") the_parser.add_argument('--filter_maxScore', action="store", type=float, default=0, help="filter out best BitScores below the specified float number") @@ -23,29 +23,32 @@ the_parser.add_argument('--un_sequences', action="store", type=str, help="sequences that have not been blast aligned") the_parser.add_argument('--dataset_name', action="store", type=str, default="", help="the name of the dataset that has been parsed, to be reported in the output") args = the_parser.parse_args() - if not all ( (args.sequences, args.blast, args.fastaOutput, args.tabularOutput) ): + if not all((args.sequences, args.blast, args.fastaOutput, args.tabularOutput)): the_parser.error('argument(s) missing, call the -h option of the script') if not args.flanking: args.flanking = 0 return args + def median(lst): lst = sorted(lst) if len(lst) < 1: return None - if len(lst) %2 == 1: + if len(lst) % 2 == 1: return lst[((len(lst)+1)/2)-1] - if len(lst) %2 == 0: + if len(lst) % 2 == 0: return float(sum(lst[(len(lst)/2)-1:(len(lst)/2)+1]))/2.0 + def mean(lst): if len(lst) < 1: return 0 return sum(lst) / float(len(lst)) -def getfasta (fastafile): + +def getfasta(fastafile): fastadic = {} - for line in open (fastafile): + for line in open(fastafile): if line[0] == ">": header = line[1:-1] fastadic[header] = "" @@ -55,13 +58,15 @@ fastadic[header] = "".join(fastadic[header].split("\n")) return fastadic + def insert_newlines(string, every=60): lines = [] for i in xrange(0, len(string), every): lines.append(string[i:i+every]) return '\n'.join(lines) - -def getblast (blastfile): + + +def getblast(blastfile): '''blastinfo [0] Percentage of identical matches blastinfo [1] Alignment length blastinfo [2] Number of mismatches @@ -73,25 +78,26 @@ blastinfo [8] Expectation value (E-value) blastinfo [9] Bit score blastinfo [10] Subject length (NEED TO BE SPECIFIED WHEN RUNNING BLAST) ''' - blastdic = defaultdict (dict) - for line in open (blastfile): + blastdic = defaultdict(dict) + for line in open(blastfile): fields = line[:-1].split("\t") transcript = fields[0] subject = fields[1] - blastinfo = [float(fields[2]) ] # blastinfo[0] - blastinfo = blastinfo + [int(i) for i in fields[3:10] ] # blastinfo[1:8] insets 1 to 7 - blastinfo.append(fields[10]) # blastinfo[8] E-value remains as a string type - blastinfo.append(float(fields[11])) # blastinfo[9] Bit score - blastinfo.append(int(fields[12])) # blastinfo[10] Subject length MUST BE RETRIEVED THROUGH A 13 COLUMN BLAST OUTPUT + blastinfo = [float(fields[2])] # blastinfo[0] + blastinfo = blastinfo + [int(i) for i in fields[3:10]] # blastinfo[1:8] insets 1 to 7 + blastinfo.append(fields[10]) # blastinfo[8] E-value remains as a string type + blastinfo.append(float(fields[11])) # blastinfo[9] Bit score + blastinfo.append(int(fields[12])) # blastinfo[10] Subject length MUST BE RETRIEVED THROUGH A 13 COLUMN BLAST OUTPUT try: blastdic[subject][transcript].append(blastinfo) - except: - blastdic[subject][transcript] = [ blastinfo ] + except Exception: + blastdic[subject][transcript] = [blastinfo] return blastdic -def getseq (fastadict, transcript, up, down, orientation="direct"): - def reverse (seq): - revdict = {"A":"T","T":"A","G":"C","C":"G","N":"N"} + +def getseq(fastadict, transcript, up, down, orientation="direct"): + def reverse(seq): + revdict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"} revseq = [revdict[i] for i in seq[::-1]] return "".join(revseq) pickseq = fastadict[transcript][up-1:down] @@ -100,7 +106,8 @@ else: return reverse(pickseq) -def subjectCoverage (fastadict, blastdict, subject, QueriesFlankingNucleotides=0): + +def subjectCoverage(fastadict, blastdict, subject, QueriesFlankingNucleotides=0): SubjectCoverageList = [] HitDic = {} bitScores = [] @@ -110,15 +117,16 @@ for hit in blastdict[subject][transcript]: hitNumber += 1 suffix = "hit%s_IdMatch=%s,AligLength=%s,E-val=%s" % (hitNumber, hit[0], hit[1], hit[8]) - HitDic[prefix+suffix] = GetHitSequence (fastadict, transcript, hit[4], hit[5], QueriesFlankingNucleotides) #query coverage by a hit is in hit[4:6] - SubjectCoverageList += range (min([hit[6], hit[7]]), max([hit[6], hit[7]]) + 1) # subject coverage by a hit is in hit[6:8] + HitDic[prefix+suffix] = GetHitSequence(fastadict, transcript, hit[4], hit[5], QueriesFlankingNucleotides) # query coverage by a hit is in hit[4:6] + SubjectCoverageList += range(min([hit[6], hit[7]]), max([hit[6], hit[7]]) + 1) # subject coverage by a hit is in hit[6:8] bitScores.append(hit[9]) - subjectLength = hit [10] # always the same value for a given subject. Stupid but simple - TotalSubjectCoverage = len ( set (SubjectCoverageList) ) + subjectLength = hit[10] # always the same value for a given subject. Stupid but simple + TotalSubjectCoverage = len(set(SubjectCoverageList)) RelativeSubjectCoverage = TotalSubjectCoverage/float(subjectLength) return HitDic, subjectLength, TotalSubjectCoverage, RelativeSubjectCoverage, max(bitScores), mean(bitScores) - -def GetHitSequence (fastadict, FastaHeader, leftCoordinate, rightCoordinate, FlankingValue): + + +def GetHitSequence(fastadict, FastaHeader, leftCoordinate, rightCoordinate, FlankingValue): if rightCoordinate > leftCoordinate: polarity = "direct" else: @@ -128,18 +136,19 @@ leftCoordinate -= FlankingValue else: leftCoordinate = 1 - return getseq (fastadict, FastaHeader, leftCoordinate, rightCoordinate, polarity) - -def outputParsing (dataset_name, F, Fasta, results, Xblastdict, fastadict, filter_relativeCov=0, filter_maxScore=0, filter_meanScore=0, filter_term_in="", filter_term_out="", mode="verbose"): - def filter_results (results, filter_relativeCov=0, filter_maxScore=0, filter_meanScore=0, filter_term_in="", filter_term_out=""): + return getseq(fastadict, FastaHeader, leftCoordinate, rightCoordinate, polarity) + + +def outputParsing(dataset_name, F, Fasta, results, Xblastdict, fastadict, filter_relativeCov=0, filter_maxScore=0, filter_meanScore=0, filter_term_in="", filter_term_out="", mode="verbose"): + def filter_results(results, filter_relativeCov=0, filter_maxScore=0, filter_meanScore=0, filter_term_in="", filter_term_out=""): for subject in results.keys(): - if results[subject]["RelativeSubjectCoverage"]>F, "--- %s ---" % (dataset_name) print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore" - for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True): + for subject in sorted(results, key=lambda x: results[x]["meanBitScores"], reverse=True): print >> F, " \n# %s" % subject print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"]) print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"]) @@ -171,8 +180,8 @@ print >> F, "# Best Bit Score: %s" % (results[subject]["maxBitScores"]) print >> F, "# Mean Bit Score: %s" % (results[subject]["meanBitScores"]) for header in results[subject]["HitDic"]: - print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) ) - print >> Fasta, "" # final carriage return for the sequence + print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header])) + print >> Fasta, "" # final carriage return for the sequence for transcript in Xblastdict[subject]: transcriptSize = float(len(fastadict[transcript])) for hit in Xblastdict[subject][transcript]: @@ -185,7 +194,7 @@ else: print >>F, "--- %s ---" % (dataset_name) print >>F, "# subject\tsubject length\tTotal Subject Coverage\tRelative Subject Coverage\tBest Bit Score\tMean Bit Score" - for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True): + for subject in sorted(results, key=lambda x: results[x]["meanBitScores"], reverse=True): line = [] line.append(subject) line.append(results[subject]["subjectLength"]) @@ -196,36 +205,43 @@ line = [str(i) for i in line] print >> F, "\t".join(line) for header in results[subject]["HitDic"]: - print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) ) - print >> Fasta, "" # final carriage return for the sequence + print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header])) + print >> Fasta, "" # final carriage return for the sequence F.close() Fasta.close() return blasted_transcripts - -def dispatch_sequences (fastadict, blasted_transcripts, matched_sequences, unmatched_sequences): + + +def dispatch_sequences(fastadict, blasted_transcripts, matched_sequences, unmatched_sequences): '''to output the sequences that matched and did not matched in the blast''' - F_matched = open (matched_sequences, "w") - F_unmatched = open (unmatched_sequences, "w") + F_matched = open(matched_sequences, "w") + F_unmatched = open(unmatched_sequences, "w") for transcript in fastadict: - if transcript in blasted_transcripts: # le list of blasted_transcripts is generated by the outputParsing function - print >> F_matched, ">%s\n%s" % (transcript, insert_newlines(fastadict[transcript]) ) + if transcript in blasted_transcripts: # list of blasted_transcripts is generated by the outputParsing function + print >> F_matched, ">%s\n%s" % (transcript, insert_newlines(fastadict[transcript])) else: - print >> F_unmatched, ">%s\n%s" % (transcript, insert_newlines(fastadict[transcript]) ) + print >> F_unmatched, ">%s\n%s" % (transcript, insert_newlines(fastadict[transcript])) F_matched.close() F_unmatched.close() return -def __main__ (): + +def __main__(): args = Parser() - fastadict = getfasta (args.sequences) - Xblastdict = getblast (args.blast) + fastadict = getfasta(args.sequences) + Xblastdict = getblast(args.blast) results = defaultdict(dict) for subject in Xblastdict: - results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["meanBitScores"] = subjectCoverage(fastadict, Xblastdict, subject, args.flanking) - blasted_transcripts = outputParsing (args.dataset_name, args.tabularOutput, args.fastaOutput, results, Xblastdict, fastadict, - filter_relativeCov=args.filter_relativeCov, filter_maxScore=args.filter_maxScore, - filter_meanScore=args.filter_meanScore, filter_term_in=args.filter_term_in, - filter_term_out=args.filter_term_out, mode=args.mode) - dispatch_sequences (fastadict, blasted_transcripts, args.al_sequences, args.un_sequences) + results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["meanBitScores"] = subjectCoverage(fastadict, Xblastdict, subject, args.flanking) + blasted_transcripts = outputParsing(args.dataset_name, args.tabularOutput, + args.fastaOutput, results, Xblastdict, fastadict, + filter_relativeCov=args.filter_relativeCov, + filter_maxScore=args.filter_maxScore, + filter_meanScore=args.filter_meanScore, + filter_term_in=args.filter_term_in, + filter_term_out=args.filter_term_out, + mode=args.mode) + dispatch_sequences(fastadict, blasted_transcripts, args.al_sequences, args.un_sequences) -if __name__=="__main__": __main__() \ No newline at end of file +if __name__ == "__main__": + __main__() diff -r 6dfa79a6908a -r 1991c830504a BlastParser_and_hits.xml --- a/BlastParser_and_hits.xml Tue Apr 05 05:19:08 2016 -0400 +++ b/BlastParser_and_hits.xml Wed Nov 09 11:32:32 2016 -0500 @@ -1,12 +1,12 @@ for virus discovery - -BlastParser_and_hits.py - --sequences $sequences - --blast $blast - --tabularOutput $tabularOutput - --fastaOutput $fastaOutput + + ]]> - + - +