Mercurial > repos > earlhaminst > blast_parser
comparison blast_parser.py @ 3:70df762b48a8 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 32272744ad83a704fd427b48aae574496a279901-dirty
| author | earlhaminst |
|---|---|
| date | Tue, 03 Oct 2017 04:51:45 -0400 |
| parents | |
| children | 363f3480622d |
comparison
equal
deleted
inserted
replaced
| 2:376ed15e0d27 | 3:70df762b48a8 |
|---|---|
| 1 """ | |
| 2 Simple parser to convert a BLAST 12-column or 24-column tabular output into a | |
| 3 3-column tabular input for hcluster_hg (id1, id2, weight): | |
| 4 """ | |
| 5 import argparse | |
| 6 import math | |
| 7 from collections import OrderedDict | |
| 8 | |
| 9 | |
| 10 def main(): | |
| 11 parser = argparse.ArgumentParser() | |
| 12 | |
| 13 parser.add_argument('-i', metavar='in-file', type=argparse.FileType('rt'), required=True, help='Path to input file') | |
| 14 | |
| 15 parser.add_argument('-o', metavar='out-file', type=argparse.FileType('wt'), required=True, help='Path to output file') | |
| 16 | |
| 17 parser.add_argument('-r', action='store_true', default=False, | |
| 18 dest='reciprocal', | |
| 19 help='Annotate homolog pair') | |
| 20 | |
| 21 parser.add_argument('--version', action='version', version='%(prog)s 1.0') | |
| 22 | |
| 23 options = parser.parse_args() | |
| 24 | |
| 25 results = OrderedDict() | |
| 26 | |
| 27 for line in options.i: | |
| 28 line = line.rstrip() | |
| 29 line_cols = line.split('\t') | |
| 30 sequence1_id = line_cols[0] | |
| 31 sequence2_id = line_cols[1] | |
| 32 evalue = float(line_cols[10]) | |
| 33 | |
| 34 # Ignore self-matching hits | |
| 35 if sequence1_id != sequence2_id: | |
| 36 # Convert evalue to an integer weight with max 100 | |
| 37 weight = 100 | |
| 38 | |
| 39 # If the evalue is 0, leave weight at 100 | |
| 40 if evalue != 0.0: | |
| 41 weight = min(100, round(math.log10(evalue) / -2.0)) | |
| 42 | |
| 43 if (sequence1_id, sequence2_id) not in results: | |
| 44 results[(sequence1_id, sequence2_id)] = weight | |
| 45 else: | |
| 46 results[(sequence1_id, sequence2_id)] = max(results[(sequence1_id, sequence2_id)], weight) | |
| 47 | |
| 48 for (sequence1_id, sequence2_id), weight in results.items(): | |
| 49 if not options.reciprocal or (sequence2_id, sequence1_id) in results: | |
| 50 options.o.write("%s\t%s\t%d\n" % (sequence1_id, sequence2_id, weight)) | |
| 51 | |
| 52 | |
| 53 if __name__ == "__main__": | |
| 54 main() |
