Mercurial > repos > earlhaminst > blast_parser
comparison blast_parser.py @ 4:363f3480622d draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 598e396ffaf5834a1c8f32868b1568a2b48a0a78-dirty
| author | earlhaminst | 
|---|---|
| date | Thu, 12 Oct 2017 07:25:29 -0400 | 
| parents | 70df762b48a8 | 
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 3:70df762b48a8 | 4:363f3480622d | 
|---|---|
| 2 Simple parser to convert a BLAST 12-column or 24-column tabular output into a | 2 Simple parser to convert a BLAST 12-column or 24-column tabular output into a | 
| 3 3-column tabular input for hcluster_hg (id1, id2, weight): | 3 3-column tabular input for hcluster_hg (id1, id2, weight): | 
| 4 """ | 4 """ | 
| 5 import argparse | 5 import argparse | 
| 6 import math | 6 import math | 
| 7 from collections import OrderedDict | 7 import sqlite3 | 
| 8 import tempfile | |
| 9 | |
| 10 BATCH_SIZE = 2000 | |
| 11 | |
| 12 | |
| 13 def create_tables(conn): | |
| 14 cur = conn.cursor() | |
| 15 cur.execute('''CREATE TABLE alignment ( | |
| 16 id INTEGER PRIMARY KEY, | |
| 17 sequence1_id VARCHAR NOT NULL, | |
| 18 sequence2_id VARCHAR NOT NULL, | |
| 19 weight INTEGER NOT NULL)''') | |
| 20 conn.commit() | |
| 8 | 21 | 
| 9 | 22 | 
| 10 def main(): | 23 def main(): | 
| 11 parser = argparse.ArgumentParser() | 24 parser = argparse.ArgumentParser() | 
| 12 | |
| 13 parser.add_argument('-i', metavar='in-file', type=argparse.FileType('rt'), required=True, help='Path to input file') | 25 parser.add_argument('-i', metavar='in-file', type=argparse.FileType('rt'), required=True, help='Path to input file') | 
| 14 | |
| 15 parser.add_argument('-o', metavar='out-file', type=argparse.FileType('wt'), required=True, help='Path to output file') | 26 parser.add_argument('-o', metavar='out-file', type=argparse.FileType('wt'), required=True, help='Path to output file') | 
| 16 | |
| 17 parser.add_argument('-r', action='store_true', default=False, | 27 parser.add_argument('-r', action='store_true', default=False, | 
| 18 dest='reciprocal', | 28 dest='reciprocal', | 
| 19 help='Annotate homolog pair') | 29 help='Annotate homolog pair') | 
| 20 | |
| 21 parser.add_argument('--version', action='version', version='%(prog)s 1.0') | |
| 22 | |
| 23 options = parser.parse_args() | 30 options = parser.parse_args() | 
| 24 | 31 | 
| 25 results = OrderedDict() | 32 db_file = tempfile.NamedTemporaryFile(suffix=".sqlite") | 
| 26 | 33 | 
| 34 conn = sqlite3.connect(db_file.name) | |
| 35 conn.execute('PRAGMA foreign_keys = ON') | |
| 36 | |
| 37 create_tables(conn) | |
| 38 | |
| 39 cur = conn.cursor() | |
| 40 | |
| 41 i = 0 | |
| 27 for line in options.i: | 42 for line in options.i: | 
| 28 line = line.rstrip() | 43 line = line.rstrip() | 
| 29 line_cols = line.split('\t') | 44 line_cols = line.split('\t') | 
| 30 sequence1_id = line_cols[0] | 45 sequence1_id = line_cols[0] | 
| 31 sequence2_id = line_cols[1] | 46 sequence2_id = line_cols[1] | 
| 47 | |
| 48 # Ignore self-matching hits | |
| 49 if sequence1_id == sequence2_id: | |
| 50 continue | |
| 51 | |
| 52 i = i + 1 | |
| 32 evalue = float(line_cols[10]) | 53 evalue = float(line_cols[10]) | 
| 33 | 54 | 
| 34 # Ignore self-matching hits | 55 # Convert evalue to an integer weight with max 100 | 
| 35 if sequence1_id != sequence2_id: | 56 if evalue != 0.0: | 
| 36 # Convert evalue to an integer weight with max 100 | 57 weight = min(100, round(math.log10(evalue) / -2.0)) | 
| 58 else: | |
| 59 # If the evalue is 0, leave weight at 100 | |
| 37 weight = 100 | 60 weight = 100 | 
| 38 | 61 | 
| 39 # If the evalue is 0, leave weight at 100 | 62 cur.execute('INSERT INTO alignment (id, sequence1_id, sequence2_id, weight) VALUES (?, ?, ?, ?)', | 
| 40 if evalue != 0.0: | 63 (i, sequence1_id, sequence2_id, weight)) | 
| 41 weight = min(100, round(math.log10(evalue) / -2.0)) | |
| 42 | 64 | 
| 43 if (sequence1_id, sequence2_id) not in results: | 65 # Commit transaction at every BATCH_SIZE rows to save memory | 
| 44 results[(sequence1_id, sequence2_id)] = weight | 66 if i % BATCH_SIZE == 0: | 
| 45 else: | 67 conn.commit() | 
| 46 results[(sequence1_id, sequence2_id)] = max(results[(sequence1_id, sequence2_id)], weight) | 68 # Commit final transaction | 
| 69 conn.commit() | |
| 70 options.i.close() | |
| 47 | 71 | 
| 48 for (sequence1_id, sequence2_id), weight in results.items(): | 72 # Delete alternative alignments keeping only one with max weight | 
| 49 if not options.reciprocal or (sequence2_id, sequence1_id) in results: | 73 cur.execute('DELETE FROM alignment WHERE id NOT IN (SELECT id FROM alignment GROUP BY sequence1_id, sequence2_id HAVING weight=max(weight))') | 
| 50 options.o.write("%s\t%s\t%d\n" % (sequence1_id, sequence2_id, weight)) | 74 conn.commit() | 
| 75 | |
| 76 if options.reciprocal: | |
| 77 query = 'SELECT a1.sequence1_id, a1.sequence2_id, a1.weight FROM alignment a1, alignment a2 WHERE a1.sequence1_id = a2.sequence2_id AND a1.sequence2_id = a2.sequence1_id ORDER BY a1.id' | |
| 78 else: | |
| 79 query = 'SELECT sequence1_id, sequence2_id, weight FROM alignment ORDER BY id' | |
| 80 | |
| 81 cur.execute(query) | |
| 82 while True: | |
| 83 rows = cur.fetchmany(BATCH_SIZE) | |
| 84 if not rows: | |
| 85 break | |
| 86 for row in rows: | |
| 87 options.o.write("%s\t%s\t%d\n" % row) | |
| 88 | |
| 89 conn.close() | |
| 90 db_file.close() | |
| 91 options.o.close() | |
| 51 | 92 | 
| 52 | 93 | 
| 53 if __name__ == "__main__": | 94 if __name__ == "__main__": | 
| 54 main() | 95 main() | 
