Mercurial > repos > drosofff > msp_blastparser_and_hits
diff BlastParser_and_hits.py @ 0:69ea2a13947f draft
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
author | drosofff |
---|---|
date | Sun, 21 Jun 2015 14:31:29 -0400 |
parents | |
children | 1964514aabde |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BlastParser_and_hits.py Sun Jun 21 14:31:29 2015 -0400 @@ -0,0 +1,177 @@ +#!/usr/bin/python +# blastn blastx parser revised debugged: 3-4-2015. Commit issue. +# drosofff@gmail.com + +import sys +import argparse +from collections import defaultdict + +def Parser(): + the_parser = argparse.ArgumentParser() + the_parser.add_argument('--blast', action="store", type=str, help="Path to the blast output (tabular format, 12 column)") + the_parser.add_argument('--sequences', action="store", type=str, help="Path to the fasta file with blasted sequences") + the_parser.add_argument('--fastaOutput', action="store", type=str, help="fasta output file of blast hits") + the_parser.add_argument('--tabularOutput', action="store", type=str, help="tabular output file of blast analysis") + the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences") + the_parser.add_argument('--mode', action="store", choices=["verbose", "short"], type=str, help="reporting (verbose) or not reporting (short) oases contigs") + args = the_parser.parse_args() + if not all ( (args.sequences, args.blast, args.fastaOutput, args.tabularOutput) ): + the_parser.error('argument(s) missing, call the -h option of the script') + if not args.flanking: + args.flanking = 0 + return args + +def median(lst): + lst = sorted(lst) + if len(lst) < 1: + return None + if len(lst) %2 == 1: + return lst[((len(lst)+1)/2)-1] + if len(lst) %2 == 0: + return float(sum(lst[(len(lst)/2)-1:(len(lst)/2)+1]))/2.0 + +def mean(lst): + if len(lst) < 1: + return 0 + return sum(lst) / float(len(lst)) + +def getfasta (fastafile): + fastadic = {} + for line in open (fastafile): + if line[0] == ">": + header = line[1:-1] + fastadic[header] = "" + else: + fastadic[header] += line + for header in fastadic: + fastadic[header] = "".join(fastadic[header].split("\n")) + return fastadic + +def insert_newlines(string, every=60): + lines = [] + for i in xrange(0, len(string), every): + lines.append(string[i:i+every]) + return '\n'.join(lines) + +def getblast (blastfile): + '''blastinfo [0] Percentage of identical matches + blastinfo [1] Alignment length + blastinfo [2] Number of mismatches + blastinfo [3] Number of gap openings + blastinfo [4] Start of alignment in query + blastinfo [5] End of alignment in query + blastinfo [6] Start of alignment in subject (database hit) + blastinfo [7] End of alignment in subject (database hit) + blastinfo [8] Expectation value (E-value) + blastinfo [9] Bit score + blastinfo [10] Subject length (NEED TO BE SPECIFIED WHEN RUNNING BLAST) ''' + blastdic = defaultdict (dict) + for line in open (blastfile): + fields = line[:-1].split("\t") + transcript = fields[0] + subject = fields[1] + blastinfo = [float(fields[2]) ] # blastinfo[0] + blastinfo = blastinfo + [int(i) for i in fields[3:10] ] # blastinfo[1:8] insets 1 to 7 + blastinfo.append(fields[10]) # blastinfo[8] E-value remains as a string type + blastinfo.append(float(fields[11])) # blastinfo[9] Bit score + blastinfo.append(int(fields[12])) # blastinfo[10] Subject length MUST BE RETRIEVED THROUGH A 13 COLUMN BLAST OUTPUT + try: + blastdic[subject][transcript].append(blastinfo) + except: + blastdic[subject][transcript] = [ blastinfo ] + return blastdic + +def getseq (fastadict, transcript, up, down, orientation="direct"): + def reverse (seq): + revdict = {"A":"T","T":"A","G":"C","C":"G","N":"N"} + revseq = [revdict[i] for i in seq[::-1]] + return "".join(revseq) + pickseq = fastadict[transcript][up-1:down] + if orientation == "direct": + return pickseq + else: + return reverse(pickseq) + +def subjectCoverage (fastadict, blastdict, subject, QueriesFlankingNucleotides=0): + SubjectCoverageList = [] + HitDic = {} + bitScores = [] + for transcript in blastdict[subject]: + prefix = "%s--%s_" % (subject, transcript) + hitNumber = 0 + for hit in blastdict[subject][transcript]: + hitNumber += 1 + suffix = "hit%s_IdMatch=%s,AligLength=%s,E-val=%s" % (hitNumber, hit[0], hit[1], hit[8]) + HitDic[prefix+suffix] = GetHitSequence (fastadict, transcript, hit[4], hit[5], QueriesFlankingNucleotides) #query coverage by a hit is in hit[4:6] + SubjectCoverageList += range (min([hit[6], hit[7]]), max([hit[6], hit[7]]) + 1) # subject coverage by a hit is in hit[6:8] + bitScores.append(hit[9]) + subjectLength = hit [10] # always the same value for a given subject. Stupid but simple + TotalSubjectCoverage = len ( set (SubjectCoverageList) ) + RelativeSubjectCoverage = TotalSubjectCoverage/float(subjectLength) + return HitDic, subjectLength, TotalSubjectCoverage, RelativeSubjectCoverage, max(bitScores), mean(bitScores) + +def GetHitSequence (fastadict, FastaHeader, leftCoordinate, rightCoordinate, FlankingValue): + if rightCoordinate > leftCoordinate: + polarity = "direct" + else: + polarity = "reverse" + leftCoordinate, rightCoordinate = rightCoordinate, leftCoordinate + if leftCoordinate - FlankingValue > 0: + leftCoordinate -= FlankingValue + else: + leftCoordinate = 1 + return getseq (fastadict, FastaHeader, leftCoordinate, rightCoordinate, polarity) + +def outputParsing (F, Fasta, results, Xblastdict, fastadict, mode="verbose"): + F= open(F, "w") + Fasta=open(Fasta, "w") + if mode == "verbose": + print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n" + for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True): + print >> F, "#\n# %s" % subject + print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"]) + print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"]) + print >> F, "# Relative Subject Coverage: %s" % (results[subject]["RelativeSubjectCoverage"]) + print >> F, "# Maximum Bit Score: %s" % (results[subject]["maxBitScores"]) + print >> F, "# Mean Bit Score: %s" % (results[subject]["meanBitScores"]) + for header in results[subject]["HitDic"]: + print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) ) + print >> Fasta, "" # final carriage return for the sequence + for transcript in Xblastdict[subject]: + transcriptSize = float(len(fastadict[transcript])) + for hit in Xblastdict[subject][transcript]: + percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov = hit[0], hit[1], hit[6], hit[7], "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100) + Eval, BitScore = hit[8], hit[9] + info = [transcript] + [percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov, Eval, BitScore] + info = [str(i) for i in info] + info = "\t".join(info) + print >> F, info + else: + print >>F, "# subject\tsubject length\tTotal Subject Coverage\tRelative Subject Coverage\tMaximum Bit Score\tMean Bit Score" + for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True): + line = [] + line.append(subject) + line.append(results[subject]["subjectLength"]) + line.append(results[subject]["TotalCoverage"]) + line.append(results[subject]["RelativeSubjectCoverage"]) + line.append(results[subject]["maxBitScores"]) + line.append(results[subject]["meanBitScores"]) + line = [str(i) for i in line] + print >> F, "\t".join(line) + for header in results[subject]["HitDic"]: + print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) ) + print >> Fasta, "" # final carriage return for the sequence + F.close() + Fasta.close() + + + +def __main__ (): + args = Parser() + fastadict = getfasta (args.sequences) + Xblastdict = getblast (args.blast) + results = defaultdict(dict) + for subject in Xblastdict: + results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["meanBitScores"] = subjectCoverage(fastadict, Xblastdict, subject, args.flanking) + outputParsing (args.tabularOutput, args.fastaOutput, results, Xblastdict, fastadict, args.mode) +if __name__=="__main__": __main__()