Mercurial > repos > drosofff > msp_blastparser_and_hits
annotate BlastParser_and_hits.py @ 3:8f5d48294f70 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit 3d9ddd0f6f3c3b97a3bebf52646731ad6771e178
author | drosofff |
---|---|
date | Mon, 19 Oct 2015 12:13:12 -0400 |
parents | bb0d4cd765c5 |
children | 60b6bd959929 |
rev | line source |
---|---|
0
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
1 #!/usr/bin/python |
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
2 # blastn blastx parser revised debugged: 3-4-2015. Commit issue. |
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
3 # drosofff@gmail.com |
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
4 |
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
5 import sys |
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
6 import argparse |
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
7 from collections import defaultdict |
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
8 |
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
def Parser():
    '''Parse the command line and return the resulting argparse namespace.

    The options --sequences, --blast, --fastaOutput and --tabularOutput must
    all be given (and non-empty), otherwise the parser prints its usage
    message and exits through ``ArgumentParser.error``.  ``flanking`` is
    normalised to 0 when it is not supplied.
    '''
    cli = argparse.ArgumentParser()
    # Declared in the order they must appear in the generated help message.
    options = [
        ('--blast', dict(type=str, help="Path to the blast output (tabular format, 12 column)")),
        ('--sequences', dict(type=str, help="Path to the fasta file with blasted sequences")),
        ('--fastaOutput', dict(type=str, help="fasta output file of blast hits")),
        ('--tabularOutput', dict(type=str, help="tabular output file of blast analysis")),
        ('--flanking', dict(type=int, help="number of flanking nucleotides added to the hit sequences")),
        ('--mode', dict(choices=["verbose", "short"], type=str, help="reporting (verbose) or not reporting (short) oases contigs")),
        ('--filter_relativeCov', dict(type=float, default=0, help="filter out relative coverages below the specified ratio (float number)")),
        ('--filter_maxScore', dict(type=float, default=0, help="filter out best BitScores below the specified float number")),
        ('--filter_meanScore', dict(type=float, default=0, help="filter out mean BitScores below the specified float number")),
        ('--al_sequences', dict(type=str, help="sequences that have been blast aligned")),
        ('--un_sequences', dict(type=str, help="sequences that have not been blast aligned")),
    ]
    for flag, settings in options:
        cli.add_argument(flag, action="store", **settings)
    args = cli.parse_args()
    mandatory = (args.sequences, args.blast, args.fastaOutput, args.tabularOutput)
    if not all(mandatory):
        cli.error('argument(s) missing, call the -h option of the script')
    # An absent --flanking (None) means "no flanking nucleotides".
    args.flanking = args.flanking or 0
    return args
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
28 |
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
def median(lst):
    '''Return the median of the values in lst, or None if lst is empty.

    For an odd number of values the middle element is returned unchanged;
    for an even number the mean of the two middle elements is returned as a
    float.  The input is not modified.
    '''
    ordered = sorted(lst)
    size = len(ordered)
    if size < 1:
        return None
    # Floor division keeps the index an integer under true division as well.
    middle = size // 2
    if size % 2 == 1:
        return ordered[middle]
    return float(ordered[middle - 1] + ordered[middle]) / 2.0
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
37 |
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
def mean(lst):
    '''Return the arithmetic mean of lst as a float, or 0 when lst is empty.'''
    count = len(lst)
    return sum(lst) / float(count) if count >= 1 else 0
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
42 |
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
def getfasta(fastafile):
    '''Read a fasta file into a dictionary {header: sequence}.

    The header is the whole text following ">" on the header line; the
    sequence is the concatenation of the following lines, without line
    breaks.  Line endings ("\\n" or "\\r\\n") are stripped, so a final line
    without a newline is read in full.  Blank lines are ignored.  If a
    header occurs more than once, the last record wins.

    Raises ValueError if sequence data appears before the first header.
    '''
    chunks = {}
    header = None
    with open(fastafile) as handle:
        for line in handle:
            line = line.rstrip("\r\n")
            if line.startswith(">"):
                header = line[1:]
                chunks[header] = []
            elif not line:
                continue
            elif header is None:
                raise ValueError("%s: sequence data found before the first fasta header" % fastafile)
            else:
                chunks[header].append(line)
    # Join once per record instead of growing a string line by line.
    return dict((name, "".join(parts)) for name, parts in chunks.items())
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
54 |
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
def insert_newlines(string, every=60):
    '''Return string wrapped into lines of at most `every` characters.

    Lines are joined with "\\n"; there is no trailing newline and an empty
    string is returned unchanged.
    '''
    # range() exists on both Python 2 and 3, unlike xrange().
    return '\n'.join(string[i:i + every] for i in range(0, len(string), every))
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
60 |
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
def getblast(blastfile):
    '''Parse a 13-column tabular blast output.

    Returns a defaultdict such that blastdic[subject][transcript] is the
    list of hits of that transcript (column 1) on that subject (column 2),
    in file order.  Each hit is a list "blastinfo":

    blastinfo [0]	Percentage of identical matches (float)
    blastinfo [1]	Alignment length (int)
    blastinfo [2]	Number of mismatches (int)
    blastinfo [3]	Number of gap openings (int)
    blastinfo [4]	Start of alignment in query (int)
    blastinfo [5]	End of alignment in query (int)
    blastinfo [6]	Start of alignment in subject (database hit) (int)
    blastinfo [7]	End of alignment in subject (database hit) (int)
    blastinfo [8]	Expectation value (E-value), kept as a string
    blastinfo [9]	Bit score (float)
    blastinfo [10]	Subject length (int), 13th column
                    (NEED TO BE SPECIFIED WHEN RUNNING BLAST)

    Blank lines and lines starting with "#" are skipped.  A line with fewer
    than 13 tab-separated columns raises IndexError; a non-numeric value in
    a numeric column raises ValueError.
    '''
    blastdic = defaultdict(dict)
    with open(blastfile) as handle:
        for line in handle:
            # Strip only the line ending, so that a last line without a
            # newline does not lose the final digit of the subject length.
            line = line.rstrip("\r\n")
            if not line or line.startswith("#"):
                continue
            fields = line.split("\t")
            transcript, subject = fields[0], fields[1]
            blastinfo = [float(fields[2])]  # blastinfo[0]
            blastinfo.extend(int(i) for i in fields[3:10])  # blastinfo[1:8]
            blastinfo.append(fields[10])  # blastinfo[8] E-value remains a string
            blastinfo.append(float(fields[11]))  # blastinfo[9] Bit score
            blastinfo.append(int(fields[12]))  # blastinfo[10] Subject length
            blastdic[subject].setdefault(transcript, []).append(blastinfo)
    return blastdic
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
88 |
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
# Complement of every IUPAC nucleotide code; lower-case entries are added
# below so that soft-masked sequences keep their case.
_COMPLEMENT = {
    "A": "T", "T": "A", "G": "C", "C": "G", "U": "A", "N": "N",
    "R": "Y", "Y": "R", "S": "S", "W": "W", "K": "M", "M": "K",
    "B": "V", "V": "B", "D": "H", "H": "D",
}
_COMPLEMENT.update(dict((base.lower(), comp.lower())
                        for base, comp in list(_COMPLEMENT.items())))


def getseq(fastadict, transcript, up, down, orientation="direct"):
    '''Return the region [up, down] (1-based, inclusive) of a sequence.

    fastadict maps headers to sequences and transcript is the header to
    look up.  With orientation "direct" the region is returned as is; with
    any other value its reverse complement is returned.  Characters without
    a known complement (gaps, for instance) are left unchanged.  Because
    the region is taken with a slice, a `down` value beyond the end of the
    sequence simply stops at the end.
    '''
    pickseq = fastadict[transcript][up - 1:down]
    if orientation == "direct":
        return pickseq
    return "".join(_COMPLEMENT.get(base, base) for base in reversed(pickseq))
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
99 |
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
def subjectCoverage(fastadict, blastdict, subject, QueriesFlankingNucleotides=0):
    '''Summarise all the hits recorded for one subject.

    blastdict[subject] maps each transcript to its list of hits, each hit
    being a list laid out as described in getblast.  Returns the tuple
    (HitDic, subjectLength, TotalSubjectCoverage, RelativeSubjectCoverage,
    best bit score, mean bit score) where:

    - HitDic maps a label "<subject>--<transcript>_hit<n>_IdMatch=..,
      AligLength=..,E-val=.." to the query sequence of that hit, as
      returned by GetHitSequence with the requested flanking;
    - TotalSubjectCoverage is the number of distinct subject positions
      covered by at least one hit;
    - RelativeSubjectCoverage is that number divided by subjectLength.

    Raises ValueError if no hit is recorded for the subject.
    '''
    covered = set()  # distinct subject positions covered by any hit
    HitDic = {}
    bitScores = []
    subjectLength = None
    for transcript, hits in blastdict[subject].items():
        prefix = "%s--%s_" % (subject, transcript)
        for hitNumber, hit in enumerate(hits, 1):
            suffix = "hit%s_IdMatch=%s,AligLength=%s,E-val=%s" % (hitNumber, hit[0], hit[1], hit[8])
            # query coverage by a hit is in hit[4:6]
            HitDic[prefix + suffix] = GetHitSequence(fastadict, transcript, hit[4], hit[5], QueriesFlankingNucleotides)
            # subject coverage by a hit is in hit[6:8]; either order is possible
            covered.update(range(min(hit[6], hit[7]), max(hit[6], hit[7]) + 1))
            bitScores.append(hit[9])
            subjectLength = hit[10]  # the last hit read provides the value
    if subjectLength is None:
        raise ValueError("no blast hit recorded for subject %r" % (subject,))
    TotalSubjectCoverage = len(covered)
    RelativeSubjectCoverage = TotalSubjectCoverage / float(subjectLength)
    return HitDic, subjectLength, TotalSubjectCoverage, RelativeSubjectCoverage, max(bitScores), mean(bitScores)
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
117 |
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
def GetHitSequence(fastadict, FastaHeader, leftCoordinate, rightCoordinate, FlankingValue):
    '''Return the sequence of a hit on a query, extended by flanking bases.

    leftCoordinate and rightCoordinate are the 1-based start and end of the
    alignment in the sequence stored under FastaHeader in fastadict.  When
    the end is smaller than the start the hit is on the reverse strand and
    the reverse complement is returned.  FlankingValue nucleotides are added
    on both sides of the hit, limited to the bounds of the sequence.
    '''
    # ">=" so that a one-base hit (start == end) is not treated as reverse.
    if rightCoordinate >= leftCoordinate:
        polarity = "direct"
    else:
        polarity = "reverse"
        leftCoordinate, rightCoordinate = rightCoordinate, leftCoordinate
    # Do not extend before the first base.
    leftCoordinate = max(1, leftCoordinate - FlankingValue)
    # getseq takes the region with a slice, so an end coordinate beyond the
    # sequence length stops at the last base.
    rightCoordinate += FlankingValue
    return getseq(fastadict, FastaHeader, leftCoordinate, rightCoordinate, polarity)
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
129 |
1
1964514aabde
planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit 1cc2b50091f512593c502176619998f5908fc8e8
drosofff
parents:
0
diff
changeset
|
def outputParsing(F, Fasta, results, Xblastdict, fastadict,
                  filter_relativeCov=0, filter_maxScore=0,
                  filter_meanScore=0, mode="verbose"):
    '''Write the tabular report and the fasta file of hit sequences.

    F and Fasta are the paths of the tabular and fasta outputs.  results
    maps each subject to a dictionary with the keys "HitDic",
    "subjectLength", "TotalCoverage", "RelativeSubjectCoverage",
    "maxBitScores" and "meanBitScores".  Xblastdict[subject][transcript] is
    a list of hits laid out as described in getblast, and fastadict maps
    transcripts to their sequences.

    Subjects whose relative coverage, best bit score or mean bit score is
    below the corresponding filter are left out of both files.  The other
    subjects are reported by decreasing mean bit score.  In "verbose" mode
    each subject gets a commented header followed by one line per hit; in
    any other mode each subject gets a single summary line.

    Returns the list (in no particular order, without duplicates) of the
    transcripts that hit at least one retained subject.
    '''
    def passes_filters(subject):
        stats = results[subject]
        return not (stats["RelativeSubjectCoverage"] < filter_relativeCov
                    or stats["maxBitScores"] < filter_maxScore
                    or stats["meanBitScores"] < filter_meanScore)

    ranked = sorted(results, key=lambda x: results[x]["meanBitScores"], reverse=True)
    kept_subjects = [subject for subject in ranked if passes_filters(subject)]

    blasted = set()
    for subject in kept_subjects:
        blasted.update(Xblastdict[subject])
    blasted_transcripts = list(blasted)

    verbose = mode == "verbose"
    # file.write() is used instead of "print >>" so that the function runs
    # on Python 2 and 3 alike; the bytes written are unchanged.
    with open(F, "w") as table, open(Fasta, "w") as fasta:
        if verbose:
            # The blank line after this header is part of the format.
            table.write("# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n\n")
        else:
            table.write("# subject\tsubject length\tTotal Subject Coverage\tRelative Subject Coverage\tBest Bit Score\tMean Bit Score\n")
        for subject in kept_subjects:
            stats = results[subject]
            if verbose:
                table.write("#\n# %s\n" % subject)
                table.write("# Suject Length: %s\n" % (stats["subjectLength"]))
                table.write("# Total Subject Coverage: %s\n" % (stats["TotalCoverage"]))
                table.write("# Relative Subject Coverage: %s\n" % (stats["RelativeSubjectCoverage"]))
                table.write("# Best Bit Score: %s\n" % (stats["maxBitScores"]))
                table.write("# Mean Bit Score: %s\n" % (stats["meanBitScores"]))
            else:
                summary = [subject, stats["subjectLength"], stats["TotalCoverage"],
                           stats["RelativeSubjectCoverage"], stats["maxBitScores"],
                           stats["meanBitScores"]]
                table.write("\t".join(str(i) for i in summary) + "\n")
            for header in stats["HitDic"]:
                # Each record is followed by an empty line.
                fasta.write(">%s\n%s\n\n" % (header, insert_newlines(stats["HitDic"][header])))
            if verbose:
                for transcript in Xblastdict[subject]:
                    transcriptSize = float(len(fastadict[transcript]))
                    for hit in Xblastdict[subject][transcript]:
                        # Share of the query spanned by the hit, in percent.
                        queryCov = "%.1f" % (abs(hit[5] - hit[4]) / transcriptSize * 100)
                        info = [transcript, hit[0], hit[1], hit[6], hit[7], queryCov, hit[8], hit[9]]
                        table.write("\t".join(str(i) for i in info) + "\n")
    return blasted_transcripts
0
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
183 |
2
bb0d4cd765c5
planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit 6dee2ab33610e7724e9423cc09818bcbbf11ea82
drosofff
parents:
1
diff
changeset
|
def dispatch_sequences (fastadict, blasted_transcripts, matched_sequences, unmatched_sequences):
    '''Write the input sequences to two FASTA files, split by blast status.

    fastadict -- mapping of sequence name to sequence; every entry is
        written to exactly one of the two output files.
    blasted_transcripts -- collection of sequence names considered matched
        by the blast (generated by the outputParsing function).
    matched_sequences -- path of the FASTA file that receives the sequences
        whose name is in blasted_transcripts.
    unmatched_sequences -- path of the FASTA file that receives every other
        sequence.

    Both output files are created (or truncated) even when fastadict is
    empty. Returns None.
    '''
    # Build the lookup once: membership in a list is a linear scan, which
    # would be repeated for every sequence in fastadict.
    blasted = frozenset(blasted_transcripts)
    # The with-statement guarantees both handles are closed even if
    # formatting or writing a record raises.
    with open(matched_sequences, "w") as F_matched, open(unmatched_sequences, "w") as F_unmatched:
        for transcript in fastadict:
            if transcript in blasted:
                destination = F_matched
            else:
                destination = F_unmatched
            # Explicit trailing newline reproduces what "print >> file" emitted.
            destination.write(">%s\n%s\n" % (transcript, insert_newlines(fastadict[transcript])))
    return
0
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
196 |
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
def __main__ ():
    '''Run the whole pipeline from the command-line arguments.

    Loads the sequences and the blast table, computes the per-subject
    coverage and score values with subjectCoverage, writes the tabular and
    fasta reports through outputParsing, then dispatches the input
    sequences into the aligned / unaligned files.
    '''
    args = Parser()
    fastadict = getfasta(args.sequences)
    Xblastdict = getblast(args.blast)

    results = defaultdict(dict)
    for subject in Xblastdict:
        # subjectCoverage yields six values, in this fixed order.
        (hit_dic, subject_length, total_coverage, relative_coverage,
         max_bit_score, mean_bit_score) = subjectCoverage(fastadict, Xblastdict, subject, args.flanking)
        entry = results[subject]
        entry["HitDic"] = hit_dic
        entry["subjectLength"] = subject_length
        entry["TotalCoverage"] = total_coverage
        entry["RelativeSubjectCoverage"] = relative_coverage
        entry["maxBitScores"] = max_bit_score
        entry["meanBitScores"] = mean_bit_score

    blasted_transcripts = outputParsing(
        args.tabularOutput,
        args.fastaOutput,
        results,
        Xblastdict,
        fastadict,
        filter_relativeCov=args.filter_relativeCov,
        filter_maxScore=args.filter_maxScore,
        filter_meanScore=args.filter_meanScore,
        mode=args.mode)
    dispatch_sequences(fastadict, blasted_transcripts, args.al_sequences, args.un_sequences)
bb0d4cd765c5
planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit 6dee2ab33610e7724e9423cc09818bcbbf11ea82
drosofff
parents:
1
diff
changeset
|
208 |
0
69ea2a13947f
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    __main__()