Mercurial > repos > iuc > virannot_otu
comparison blast2tsv.py @ 0:c9dac9b2e01c draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit 3a3b40c15ae5e82334f016e88b1f3c5bbbb3b2cd
| author | iuc |
|---|---|
| date | Mon, 04 Mar 2024 19:56:40 +0000 |
| parents | |
| children | 735a21808348 |
Comparison view legend: equal · deleted · inserted · replaced
| -1:000000000000 | 0:c9dac9b2e01c |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 | |
| 4 # Name: blast2tsv | |
| 5 # Author(s): Sebastien Theil, Marie Lefebvre - INRAE | |
| 6 # Aims: Convert blast xml output to tsv and add taxonomy | |
| 7 | |
| 8 | |
| 9 import argparse | |
| 10 import csv | |
| 11 import logging as log | |
| 12 import os | |
| 13 | |
| 14 from Bio import Entrez | |
| 15 from Bio import SeqIO | |
| 16 from Bio.Blast import NCBIXML | |
| 17 from ete3 import NCBITaxa | |
| 18 | |
| 19 ncbi = NCBITaxa() | |
| 20 | |
| 21 | |
def main():
    """Entry point: parse CLI options, set up logging, then convert the
    BLAST XML results to the annotated TSV output."""
    opts = _set_options()
    _set_log_level(opts.verbosity)
    _write_tsv(opts, _read_xml(opts))
| 27 | |
| 28 | |
| 29 def _guess_database(accession): | |
| 30 """Guess the correct database for querying based off the format of the accession""" | |
| 31 database_mappings_refseq = {'AC_': 'nuccore', 'NC_': 'nuccore', 'NG_': 'nuccore', | |
| 32 'NT_': 'nuccore', 'NW_': 'nuccore', 'NZ_': 'nuccore', | |
| 33 'AP_': 'protein', 'NP_': 'protein', 'YP_': 'protein', | |
| 34 'XP_': 'protein', 'WP_': 'protein'} | |
| 35 return database_mappings_refseq[accession[0:3]] | |
| 36 | |
| 37 | |
def _read_xml(options):
    """
    Parse XML blast results file
    Keep only the first hit

    For each alignment, HSP statistics (identity, score, evalue, length) are
    accumulated over all HSPs on the same strand as the first one, then a
    single summary record is built and stored under the query id — so later
    alignments for the same query overwrite earlier ones.  Taxonomy is
    fetched from NCBI via Entrez + ete3.  Records failing the evalue /
    overlap / score thresholds are stored as a one-element list
    [query_length] instead of a dict.

    :param options: parsed argparse namespace (xml_file, algo, thresholds)
    :return: dict mapping query_id -> summary dict or [query_length]
    """
    log.info("Read XML file.")
    results = open(options.xml_file, 'r')
    records = NCBIXML.parse(results)
    xml_results = {}
    for blast_record in records:
        for aln in blast_record.alignments:
            hit_count = 1
            for hit in aln.hsps:
                # NOTE(review): hsp is re-created per HSP but only filled in
                # after this loop, so only the last HSP's dict survives.
                hsp = {}
                if hit_count == 1:
                    first_hit_frame = hit.frame[1] if len(hit.frame) > 0 else 0  # strand
                    cumul_hit_identity = hit.identities if hit.identities else 0
                    cumul_hit_score = hit.bits  # hit score
                    cumul_hit_evalue = hit.expect  # evalue
                    cumul_hit_length = hit.align_length if hit.align_length is not None else 0
                    hit_count = hit_count + 1
                else:
                    # all HSPs in different strand than 1st HSPs will be discarded.
                    if (first_hit_frame > 0 and hit.frame[1] > 0) or (first_hit_frame < 0 and hit.frame[1] < 0):
                        cumul_hit_identity = cumul_hit_identity + hit.identities
                        cumul_hit_length = cumul_hit_length + hit.align_length
                        cumul_hit_evalue = cumul_hit_evalue + hit.expect
                        cumul_hit_score = cumul_hit_score + hit.bits
                        hit_count = hit_count + 1
            # After the loop hit_count is (number of accumulated HSPs) + 1,
            # so final_hit_count is the count actually summed above.
            # NOTE(review): if aln.hsps is empty, hsp and the cumul_* names
            # are unbound here — presumably NCBIXML always yields >= 1 HSP
            # per alignment; confirm.
            if hit_count == 1:
                final_hit_count = hit_count
            elif hit_count > 1:
                final_hit_count = hit_count - 1
            hsp["evalue"] = cumul_hit_evalue / final_hit_count  # The smaller the E-value, the better the match
            hsp["query_id"] = blast_record.query_id
            hsp["query_length"] = blast_record.query_length  # length of the query
            hsp["accession"] = aln.accession.replace("ref|", "")
            hsp["description"] = aln.hit_def
            hsp["hit_length"] = aln.length  # length of the hit
            hsp["hsp_length"] = hit.align_length  # length of the hsp alignment
            hsp["queryOverlap"] = _get_overlap_value(options.algo, hsp, 'hsp', hsp["query_length"])[0]
            if cumul_hit_length == 0:
                hsp["percentIdentity"] = round(cumul_hit_identity, 1)  # identity percentage
            else:
                hsp["percentIdentity"] = round(cumul_hit_identity / cumul_hit_length * 100, 1)  # identity percentage
            hsp["score"] = cumul_hit_score  # The higher the bit-score, the better the sequence similarity
            hsp["num_hsps"] = final_hit_count
            hsp["hit_cumul_length"] = cumul_hit_length
            hsp["hitOverlap"] = _get_overlap_value(options.algo, hsp, 'hit', hsp["query_length"])[1]
            # Resolve taxonomy: Entrez esummary gives the taxid, ete3 expands
            # it to a lineage of names (network / local-DB access).
            db = _guess_database(hsp["accession"])
            try:
                handle = Entrez.esummary(db=db, id=hsp["accession"])
                taxid = str(int(Entrez.read(handle)[0]['TaxId']))
                handle.close()
                log.info("Taxid found for " + hsp["accession"])
                lineage = ncbi.get_lineage(taxid)
                names = ncbi.get_taxid_translator(lineage)
                ordered = [names[tid] for tid in lineage]
                taxonomy = ordered[1:]  # drop the "root" node
                hsp["tax_id"] = taxid
                hsp["taxonomy"] = ';'.join(taxonomy)
                hsp["organism"] = taxonomy[-1]
            except RuntimeError:
                # Entrez.read raises RuntimeError when the lookup fails;
                # leave taxonomy fields empty rather than aborting.
                hsp["tax_id"] = ""
                hsp["taxonomy"] = ""
                hsp["organism"] = ""
                log.warning("RuntimeError - Taxid not found for " + hsp["accession"])
            # Keep the full record only if it passes all filtering thresholds;
            # otherwise record just the query length (marker for "filtered").
            if hsp["evalue"] <= options.max_evalue and hsp["queryOverlap"] >= options.min_qov and \
                    hsp["hitOverlap"] >= options.min_hov and hsp["score"] >= options.min_score:
                xml_results[hsp["query_id"]] = hsp
            else:
                xml_results[hsp["query_id"]] = [hsp["query_length"]]

    return xml_results
| 112 | |
| 113 | |
| 114 def _get_overlap_value(algo, hsp, type, qlength): | |
| 115 """ | |
| 116 Set hsp or hit overlap values for hit and query | |
| 117 Return array [query_overlap, hit_overlap] | |
| 118 """ | |
| 119 if type == 'hsp': | |
| 120 q_align_len = qlength | |
| 121 h_align_len = hsp["hsp_length"] | |
| 122 else: | |
| 123 q_align_len = qlength | |
| 124 h_align_len = hsp["hit_cumul_length"] | |
| 125 | |
| 126 if algo == 'BLASTX': | |
| 127 if q_align_len: | |
| 128 query_overlap = (q_align_len * 3 / q_align_len) * 100 | |
| 129 if hsp["hit_length"]: | |
| 130 hit_overlap = (h_align_len / hsp["hit_length"]) * 100 | |
| 131 elif algo == 'TBLASTN': | |
| 132 if q_align_len: | |
| 133 query_overlap = (q_align_len / q_align_len) * 100 | |
| 134 if hsp["hit_length"]: | |
| 135 hit_overlap = (h_align_len * 3 / hsp["hit_length"]) * 100 | |
| 136 elif algo == 'TBLASTX': | |
| 137 if q_align_len: | |
| 138 query_overlap = (q_align_len * 3 / hsp["hsp_length"]) * 100 | |
| 139 if hsp["hit_length"]: | |
| 140 hit_overlap = (h_align_len * 3 / hsp["hit_length"]) * 100 | |
| 141 else: | |
| 142 if q_align_len: | |
| 143 query_overlap = (q_align_len / q_align_len) * 100 | |
| 144 if hsp["hit_length"]: | |
| 145 hit_overlap = (h_align_len / hsp["hit_length"]) * 100 | |
| 146 if query_overlap is None: | |
| 147 query_overlap = 0 | |
| 148 if query_overlap > 100: | |
| 149 query_overlap = 100 | |
| 150 if 'hit_overlap' not in locals(): | |
| 151 hit_overlap = 0 | |
| 152 if hit_overlap > 100: | |
| 153 hit_overlap = 100 | |
| 154 | |
| 155 return [round(query_overlap, 0), round(hit_overlap, 0)] | |
| 156 | |
| 157 | |
def _write_tsv(options, hits):
    """
    Write output

    hits maps query_id -> either a full annotation dict (hit passed the
    filters in _read_xml) or a one-element list [query_length] (filtered
    out).  Writes <output>/blast2tsv_output.tab and then derives the
    abundance files from it via _create_abundance.
    """
    # get a list of contig without corresponding number of mapped reads
    if options.rn_file is not None:
        with open(options.rn_file) as rn:
            rows = (line.split('\t') for line in rn)
            rn_list = {row[0]: row[1:] for row in rows}  # seqID -> remaining columns
    fasta = SeqIO.to_dict(SeqIO.parse(open(options.fasta_file), 'fasta'))
    headers = "#algo\tquery_id\tnb_reads\tquery_length\taccession\tdescription\torganism\tpercentIdentity\tnb_hsps\tqueryOverlap\thitOverlap\tevalue\tscore\ttax_id\ttaxonomy\tsequence\n"
    if not os.path.exists(options.output):
        os.mkdir(options.output)
    tsv_file = options.output + "/blast2tsv_output.tab"
    log.info("Write output file: " + tsv_file)
    f = open(tsv_file, "w+")
    f.write(headers)
    for h in hits:
        if options.rn_file is not None:
            # NOTE(review): raises KeyError if a query id is missing from
            # rn_file — presumably inputs always match; confirm.
            read_nb = ''.join(rn_list[h]).replace("\n", "")
        else:
            read_nb = ''
        # Annotated hits are dicts (len > 1); filtered hits are [query_length].
        if len(hits[h]) > 1:
            f.write(options.algo + "\t" + h + "\t" + read_nb + "\t" + str(hits[h]["query_length"]) + "\t")
            f.write(hits[h]["accession"] + "\t" + hits[h]["description"] + "\t")
            f.write(hits[h]["organism"] + "\t" + str(hits[h]["percentIdentity"]) + "\t")
            f.write(str(hits[h]["num_hsps"]) + "\t" + str(hits[h]["queryOverlap"]) + "\t")
            f.write(str(hits[h]["hitOverlap"]) + "\t" + str(hits[h]["evalue"]) + "\t")
            f.write(str(hits[h]["score"]) + "\t" + str(hits[h]["tax_id"]) + "\t")
            if h in fasta:
                f.write(hits[h]["taxonomy"] + "\t" + str(fasta[h].seq))
            else:
                # query sequence not found in the FASTA; emit a literal ""
                f.write(hits[h]["taxonomy"] + "\t\"\"")
            f.write("\n")
        else:
            # str([x])[1:-1] strips the list brackets, leaving just the length.
            f.write(options.algo + "\t" + h + "\t" + read_nb + "\t" + str(hits[h])[1:-1] + "\t")
            f.write("\n")
    f.close()
    _create_abundance(options, tsv_file)
| 197 | |
| 198 | |
def _create_abundance(options, tsv_file):
    """
    extract values from tsv files
    and create abundance files

    Aggregates read counts (column 3) and contig counts per taxonomy string
    (column 15) of the TSV, then writes blast2tsv_reads.txt and
    blast2tsv_contigs.txt: one line per taxonomy with its count followed by
    the taxonomy ranks, tab-separated.
    """
    log.info("Calculating abundance.")
    file_path = tsv_file
    abundance = dict()  # taxonomy string -> {"reads_nb": int, "contigs_nb": int}
    with open(tsv_file, 'r') as current_file:
        log.debug("Reading " + file_path)
        csv_reader = csv.reader(current_file, delimiter='\t')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                # headers
                line_count += 1
            else:
                # no annotation
                if len(row) == 16:
                    if row[14] != "":  # row[14] is the taxonomy column
                        nb_reads = row[2]
                        if nb_reads == "":
                            current_reads_nb = 0
                            log.debug("No reads number for " + row[1])
                        else:
                            current_reads_nb = int(nb_reads)
                        # NOTE: despite the name, contig_id holds the
                        # taxonomy string (row[14]), not a contig identifier.
                        contig_id = row[14]
                        if contig_id in abundance:
                            # add reads
                            abundance[contig_id]["reads_nb"] = abundance[row[14]]["reads_nb"] + current_reads_nb
                            abundance[contig_id]["contigs_nb"] = abundance[row[14]]["contigs_nb"] + 1
                        else:
                            # init reads for this taxo
                            abundance[contig_id] = {}
                            abundance[contig_id]["reads_nb"] = current_reads_nb
                            abundance[contig_id]["contigs_nb"] = 1
                    else:
                        log.debug("No annotations for contig " + row[1])
                else:
                    log.debug("No annotations for contig " + row[1])
    log.debug(abundance)
    reads_file = open(options.output + "/blast2tsv_reads.txt", "w+")
    for taxo in abundance:
        reads_file.write(str(abundance[taxo]["reads_nb"]))
        reads_file.write("\t")
        reads_file.write("\t".join(taxo.split(";")))
        reads_file.write("\n")
    reads_file.close()
    log.info("Abundance file created " + options.output + "/blast2tsv_reads.txt")
    contigs_file = open(options.output + "/blast2tsv_contigs.txt", "w+")
    for taxo in abundance:
        contigs_file.write(str(abundance[taxo]["contigs_nb"]))
        contigs_file.write("\t")
        contigs_file.write("\t".join(taxo.split(";")))
        contigs_file.write("\n")
    contigs_file.close()
    log.info("Abundance file created " + options.output + "/blast2tsv_contigs.txt")
| 256 | |
| 257 | |
| 258 def _set_options(): | |
| 259 parser = argparse.ArgumentParser() | |
| 260 parser.add_argument('-x', '--xml', help='XML files with results of blast', action='store', required=True, dest='xml_file') | |
| 261 parser.add_argument('-rn', '--read-count', help='Tab-delimited file associating seqID with read number.', action='store', dest='rn_file') | |
| 262 parser.add_argument('-c', '--contigs', help='FASTA file with contigs sequence.', action='store', required=True, dest='fasta_file') | |
| 263 parser.add_argument('-me', '--max_evalue', help='Max evalue', action='store', type=float, default=0.0001, dest='max_evalue') | |
| 264 parser.add_argument('-qov', '--min_query_overlap', help='Minimum query overlap', action='store', type=int, default=5, dest='min_qov') | |
| 265 parser.add_argument('-mhov', '--min_hit_overlap', help='Minimum hit overlap', action='store', type=int, default=5, dest='min_hov') | |
| 266 parser.add_argument('-s', '--min_score', help='Minimum score', action='store', type=int, default=30, dest='min_score') | |
| 267 parser.add_argument('-a', '--algo', help='Blast type detection (BLASTN|BLASTP|BLASTX|TBLASTX|TBLASTN|DIAMONDX).', action='store', type=str, default='BLASTX', dest='algo') | |
| 268 parser.add_argument('-o', '--out', help='The output file (.csv).', action='store', type=str, default='./blast2tsv', dest='output') | |
| 269 parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1) | |
| 270 args = parser.parse_args() | |
| 271 return args | |
| 272 | |
| 273 | |
| 274 def _set_log_level(verbosity): | |
| 275 if verbosity == 1: | |
| 276 log_format = '%(asctime)s %(levelname)-8s %(message)s' | |
| 277 log.basicConfig(level=log.INFO, format=log_format) | |
| 278 elif verbosity == 3: | |
| 279 log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s' | |
| 280 log.basicConfig(level=log.DEBUG, format=log_format) | |
| 281 | |
| 282 | |
# Script entry point: run the conversion only when executed directly.
if __name__ == "__main__":
    main()
