diff BlastParser_and_hits.py @ 1:1964514aabde draft

planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit 1cc2b50091f512593c502176619998f5908fc8e8
author drosofff
date Mon, 14 Sep 2015 12:18:46 -0400
parents 69ea2a13947f
children bb0d4cd765c5
line wrap: on
line diff
--- a/BlastParser_and_hits.py	Sun Jun 21 14:31:29 2015 -0400
+++ b/BlastParser_and_hits.py	Mon Sep 14 12:18:46 2015 -0400
@@ -14,6 +14,11 @@
     the_parser.add_argument('--tabularOutput', action="store", type=str, help="tabular output file of blast analysis")
     the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences") 
     the_parser.add_argument('--mode', action="store", choices=["verbose", "short"], type=str, help="reporting (verbose) or not reporting (short) oases contigs")
+    the_parser.add_argument('--filter_relativeCov', action="store", type=float, default=0, help="filter out relative coverages below the specified ratio (float number)")
+    the_parser.add_argument('--filter_maxScore', action="store", type=float, default=0, help="filter out maximum BitScore below the specified float number")
+    the_parser.add_argument('--filter_meanScore', action="store", type=float, default=0, help="filter out maximum BitScore below the specified float number")
+    the_parser.add_argument('--al_sequences', action="store", type=str, help="sequences that have been blast aligned")
+    the_parser.add_argument('--un_sequences', action="store", type=str, help="sequences that have not been blast aligned")
     args = the_parser.parse_args()
     if not all ( (args.sequences, args.blast, args.fastaOutput, args.tabularOutput) ):
         the_parser.error('argument(s) missing, call the -h option of the script')
@@ -122,12 +127,14 @@
         leftCoordinate = 1
     return getseq (fastadict, FastaHeader, leftCoordinate, rightCoordinate, polarity)
     
-def outputParsing (F, Fasta, results, Xblastdict, fastadict, mode="verbose"):
+def outputParsing (F, Fasta, results, Xblastdict, fastadict, filter_relativeCov=0, filter_maxScore=0, filter_meanScore=0, mode="verbose"):
     F= open(F, "w")
     Fasta=open(Fasta, "w")
     if mode == "verbose":
         print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n"
         for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True):
+            if results[subject]["RelativeSubjectCoverage"]<filter_relativeCov or results[subject]["maxBitScores"]<filter_maxScore or results[subject]["meanBitScores"]<filter_meanScore:
+                continue
             print >> F, "#\n# %s" % subject
             print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"])
             print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"])
@@ -149,6 +156,8 @@
     else:
         print >>F, "# subject\tsubject length\tTotal Subject Coverage\tRelative Subject Coverage\tMaximum Bit Score\tMean Bit Score"
         for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True):
+            if results[subject]["RelativeSubjectCoverage"]<filter_relativeCov or results[subject]["maxBitScores"]<filter_maxScore or results[subject]["meanBitScores"]<filter_meanScore:
+                continue
             line = []
             line.append(subject)
             line.append(results[subject]["subjectLength"])
@@ -164,14 +173,33 @@
     F.close()
     Fasta.close()
         
-    
+def sort_sequences (fastadict, blastdict, matched_sequences, unmatched_sequences):
+    '''to output the sequences that matched and did not matched in the blast'''
+    blasted_transcripts = []
+    for subject in blastdict:
+        for transcript in blastdict[subject]:
+            blasted_transcripts.append(transcript)
+    blasted_transcripts = list( set( blasted_transcripts))
+    F_matched = open (matched_sequences, "w")
+    F_unmatched = open (unmatched_sequences, "w")
+    for transcript in fastadict:
+        if transcript in blasted_transcripts:
+            print >> F_matched, ">%s\n%s" % (transcript, insert_newlines(fastadict[transcript]) )
+        else:
+            print >> F_unmatched, ">%s\n%s" % (transcript, insert_newlines(fastadict[transcript]) )
+    F_matched.close()
+    F_unmatched.close()
+    return
 
 def __main__ ():
     args = Parser()
     fastadict = getfasta (args.sequences)
     Xblastdict = getblast (args.blast)
+    sort_sequences (fastadict, Xblastdict, args.al_sequences, args.un_sequences)
     results = defaultdict(dict)
     for subject in Xblastdict:
         results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["meanBitScores"]  = subjectCoverage(fastadict, Xblastdict, subject, args.flanking)
-    outputParsing (args.tabularOutput, args.fastaOutput, results, Xblastdict, fastadict, args.mode)
+    outputParsing (args.tabularOutput, args.fastaOutput, results, Xblastdict, fastadict,
+                  filter_relativeCov=args.filter_relativeCov, filter_maxScore=args.filter_maxScore,
+                  filter_meanScore=args.filter_meanScore, mode=args.mode)
 if __name__=="__main__": __main__()