Mercurial > repos > artbio > blast_unmatched
changeset 0:f3b63b59a1ea draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/blast_unmatched commit 5bd801feb838592fbb1f6dd68b5f1a480042da40
author | artbio |
---|---|
date | Tue, 03 Oct 2017 07:19:17 -0400 |
parents | |
children | 50c1fa95a076 |
files | blast_unmatched.py blast_unmatched.xml test-data/test_blast.tab test-data/test_output.fa test-data/test_query.fa |
diffstat | 5 files changed, 167 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python3
"""Write the FASTA query sequences that got no hit in a tabular BLAST output."""

import optparse

# End of the slice applied to a FASTA header line to obtain the id compared
# with the BLAST query column: characters 1..99, i.e. the header without its
# leading '>'. NOTE(review): the original author's comment says query ids are
# 100 chars long in blast -- confirm against the BLAST version in use.
QUERY_ID_SLICE_END = 100


def parse_options(argv=None):
    """
    Parse the options given to the script.

    argv -- optional list of command-line arguments (program name excluded).
            When None, optparse reads sys.argv[1:].

    Returns the optparse values object with the attributes fasta_file,
    blast_file and output_file.

    Exits through parser.error() (SystemExit) when positional arguments are
    given or when one of the three options is missing.
    """
    parser = optparse.OptionParser(description='Get unmatched blast queries')
    # Adjacent string literals are used instead of a backslash continuation
    # inside the literal, which silently glued the two words together.
    parser.add_option('-f', '--fasta', dest='fasta_file',
                      help='Query fasta file '
                           'used during blast')
    parser.add_option('-b', '--blast', dest='blast_file',
                      help='Blast tabular '
                           'output (queries in 1rst column)')
    parser.add_option('-o', '--output', dest='output_file',
                      help='Output file name')
    (options, args) = parser.parse_args(argv)
    if args:
        parser.error('Wrong number of arguments')
    # Fail with a usage message rather than a TypeError from open(None).
    missing = [flag for flag, value in (('--fasta', options.fasta_file),
                                        ('--blast', options.blast_file),
                                        ('--output', options.output_file))
               if value is None]
    if missing:
        parser.error('Missing required option(s): ' + ', '.join(missing))
    return options


def get_matched(blast_file):
    """
    Get a dictionary of all the queries that got a match.

    blast_file -- path of a tab-separated file whose first column holds the
                  query id.

    Returns a dict mapping each distinct first-column value to 1. Empty lines
    are ignored.
    """
    matched = dict()
    with open(blast_file, 'r') as blast_file_handle:
        # Iterate the handle directly so the file is streamed, not loaded.
        for line in blast_file_handle:
            # Drop the line ending first so a line without any tab does not
            # yield an id that still carries its newline.
            query_id = line.rstrip('\r\n').split('\t', 1)[0]
            if query_id:
                matched[query_id] = 1
    return matched


def get_unmatched(output_file, fasta_file, matched):
    """
    Compare matched queries to the query fasta file and write the unmatched
    records to the output.

    output_file -- path of the FASTA file to create (overwritten).
    fasta_file  -- path of the FASTA file that was used as BLAST query.
    matched     -- container of matched query ids (as built by get_matched);
                   only membership tests are performed on it.

    A record is written when its header, stripped of '>' and cut at
    QUERY_ID_SLICE_END, is not in matched. Lines before the first header are
    never written.
    """
    # The input is opened first so that a missing FASTA file does not leave
    # a freshly truncated output behind.
    with open(fasta_file, 'r') as fasta_file_handle, \
            open(output_file, 'w') as output_file_handle:
        unmatched = False
        for line in fasta_file_handle:
            if line.startswith('>'):
                query_id = line[1:QUERY_ID_SLICE_END].rstrip()
                # The flag stays set for the sequence lines that follow.
                unmatched = query_id not in matched
            if unmatched:
                output_file_handle.write(line)


def __main__():
    """Run the script: parse the options, then filter the FASTA file."""
    opts = parse_options()
    matched = get_matched(opts.blast_file)
    get_unmatched(opts.output_file, opts.fasta_file, matched)


# Only run when executed as a script; importing the module has no side effect.
if __name__ == '__main__':
    __main__()
<!-- Galaxy wrapper for blast_unmatched.py: outputs the query sequences of a
     FASTA file that have no entry in a tabular BLAST result. -->
<tool id="blast_unmatched" name="Blast Unmatched" version="0.1.0">
    <description>get query sequences that didn't get a match during a blast</description>
    <requirements>
    </requirements>
    <!-- Dataset paths are single-quoted so the shell passes each one as a
         single argument whatever characters the path contains. -->
    <command detect_errors="exit_code"><![CDATA[
        python '$__tool_directory__'/blast_unmatched.py
            --fasta '$fasta_file'
            --blast '$blast_file'
            --output '$output_file'
    ]]></command>
    <inputs>
        <param type="data" name="fasta_file" format="fasta"/>
        <param type="data" name="blast_file" format="tabular"/>
    </inputs>
    <outputs>
        <data name="output_file" format="fasta" label="Unmatched queries from blast: ${blast_file.name}"/>
    </outputs>
    <tests>
        <test>
            <param name="blast_file" value="test_blast.tab"/>
            <param name="fasta_file" value="test_query.fa"/>
            <output name="output_file" ftype="fasta" file="test_output.fa"/>
        </test>
    </tests>
    <help><![CDATA[
**What it does**

This tool takes a `blast`_ output in tabular format (with the query id in the first column) and the fasta file used as query. It then returns the query sequences that remained unmatched during the blast.

.. _blast: https://blast.ncbi.nlm.nih.gov/Blast.cgi

    ]]></help>
    <citations>
    </citations>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_blast.tab Tue Oct 03 07:19:17 2017 -0400 @@ -0,0 +1,3 @@ +kp289641.1_coxsackievirus_a6_isolate_cv-a6/p395/2013/china_capsid_protein,_p2_protein,_and_rna-depe kr815992.1_coxsackievirus_a6_isolate_12743/gz/chn/2013,_complete_genome 96.18 1386 53 0 2354 3739 5952 7337 0.0 2260 +kp289641.1_coxsackievirus_a6_isolate_cv-a6/p395/2013/china_capsid_protein,_p2_protein,_and_rna-depe kr815992.1_coxsackievirus_a6_isolate_12743/gz/chn/2013,_complete_genome 96.94 915 28 0 501 1415 2430 3344 0.0 1525 +kp289641.1_coxsackievirus_a6_isolate_cv-a6/p395/2013/china_capsid_protein,_p2_protein,_and_rna-depe kr815992.1_coxsackievirus_a6_isolate_12743/gz/chn/2013,_complete_genome 95.25 737 35 0 1516 2252 3747 4483 0.0 1171
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_output.fa Tue Oct 03 07:19:17 2017 -0400 @@ -0,0 +1,7 @@ +>kc879361.1_coxsackievirus_a16_strain_34351_cva16_rus_2009_2c_protein_gene,_partial_cds +CCACTCTACGCGACTGAAGCCAAAAGAGTTTATGCTCTAGAGAAAAGGATGAACAACTACATGCAGTTCA +AGAGCAAACACCGTATTGAACCTGTATGTCTGATCATTAGAGGCTCTCCAGGAACAGGTAAGTCACTTGC +TACGGGCATCATAGCTAGAGCTATTGCAGACAAATATCATTCTAGTGTTTATTCACTCCCTCCAGACCCA +GACCATTTTGATGGGTACAAGCAACAAGTAGTCACTGTTATGGATGACCTCTGCCAAAATCCAGATGGGA +AGGACATGTCTCTATTTTGTCAAATGGTTTCTACAGTCGACTTTATACCACCCATGGCATCACTGGAAGA +GAAAGGAGTGTCTTTCACCTCTAAGTTCGTCATAGCATCAACCAATGCTAGTAATATTGTAGTTCCT
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_query.fa Tue Oct 03 07:19:17 2017 -0400 @@ -0,0 +1,62 @@ +>kc879361.1_coxsackievirus_a16_strain_34351_cva16_rus_2009_2c_protein_gene,_partial_cds +CCACTCTACGCGACTGAAGCCAAAAGAGTTTATGCTCTAGAGAAAAGGATGAACAACTACATGCAGTTCA +AGAGCAAACACCGTATTGAACCTGTATGTCTGATCATTAGAGGCTCTCCAGGAACAGGTAAGTCACTTGC +TACGGGCATCATAGCTAGAGCTATTGCAGACAAATATCATTCTAGTGTTTATTCACTCCCTCCAGACCCA +GACCATTTTGATGGGTACAAGCAACAAGTAGTCACTGTTATGGATGACCTCTGCCAAAATCCAGATGGGA +AGGACATGTCTCTATTTTGTCAAATGGTTTCTACAGTCGACTTTATACCACCCATGGCATCACTGGAAGA +GAAAGGAGTGTCTTTCACCTCTAAGTTCGTCATAGCATCAACCAATGCTAGTAATATTGTAGTTCCT +>kp289641.1_coxsackievirus_a6_isolate_cv-a6/p395/2013/china_capsid_protein,_p2_protein,_and_rna-depe +GGCGCGCCAGTCATGTCTAGATCAAGCACTTCTGTCTCCCCGGATCGAGTATCAATAGACTGCTAGCGCG +GTTGAAGGAGAAAACGTCCGTTACCCGGCTAACTACTTCGAGAAACTTAGTAGCACCATTGAAGCTGCGG +AGTGTTTCGCTCAGCACTCCCCCAGTGTAGATCAGGTCGATGAGTCACTGCACTCCCCACGGGCGACCGT +GGCAGTGACTGCGTTGGCGGCCTGCCTATGGGGCAACCCATAGGACGCTCTAAAGTGGACATGGTGCGAA +GAGTCTATTGAGCTAGTTAGTAGTCCTCCGGCCCCTGAATGCGGCTAATCCTAACTGCGGAGCACATGCC +CTCAATCCAGGGGGTGGTGTGTCGTAACGGGCAACTCTGCAGCGGAACCGNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNAATGATCCCATTACAAATGCAGTGGAAAGCGCTGTGAGCGCGCTTGCCGACACCACAATA +TCCCGGGTGACCGCAGCCAACACTGCAGCTAGCACCCACTCCCTGGGAACAGGGCGTGTACCGGCATTGC +AAGCCGCAGAAACGGGAGCAAGCTCTAATGCTAGTGATGAGAACCTTATTGAGACTCGCTGTGTGATGAA +TCGAAACGGGGTTAATGAGGCGAGTGTGGAACACTTTTACTCTCGTGCAGGGCTGGTAGGAGTTGTGGAG +GTGAAGGACTCGGGCACTAGCCTGGATGGGTACACAGTTTGGCCCATAGATGTGATGGGCTTCGTGCAAC +AGCGGCGCAAGCTAGAGCTGTCAACATACATGCGCTTTGATGCCGAGTTCACTTTTGTGTCCAACCTCAA +TGACAGCACGACGCCCGGGATGCTGCTGCAGTATATGTATGTACCACCAGGGGCCCCTAAGCCGGATAGC +AGGAAATCATATCAATGGCAGACTGCTACTAACCCGTCGGTATTCGCAAAATTGAGTGATCCACCCCCCC +AGGTATCTGTCCCGTTCATGTCGCCAGCAACAGCTTATCAGTGGTTTTATGATGGTTACCCTACATTTGG +TGAGCACAAACAAGCTACCAATTTGCAATATGGGCAGTGTCCTAATAACATGATGGGCCATTTTGCCATC 
+CGAACAGTCAGTGAATCTACCACCGGGAAAAACGTCCACGTTCGGGTGTACATGAGAATTAAGCACGTGA +GAGCTTGGGTACCTAGACCCCTTCGATCCCAAGCTTATATGGTCAAGAACTACCCGACATACAGCCAAAC +AATAACTAACACTGCAACTGACCGTGCAAGTATAACCACCACGGATTATGAAGGCGGGGTACCAGCAAAC +CCACAAAGGACATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCAGATGTCAGAGATCTTTTGTGGC +TGGATGAAGAAGCTATGGAACAGGGTGTGTCAGATTACATCAAAGGGCTCGGTGACGCATTCGGAACTGG +TTTTACTGATGCAGTGTCTAGGGAGGTGGAAGCTCTTAAGAACTACCTTATAGGATCTGAAGGGGCTGTT +GAAAAGATCTTGAAGAATTTAATTAAATTGATCTCAGCATTAGTCATAGTGACCAGAAGTGATTATGACA +TGGTAACCCTTACAGCAACCTTGGCACTCATAGGGTGTCATGGCAGCCCCTGGGCGTGGATCAAGGCTAA +GACAGCATCCATCTTAGGCATCCCTATCGCCCAGAAGCAGAGTGCGTCATGGCTCAAGAAGTTTAACGAC +ATGGCCAATGCTGCCAAGGGATTTGAGTGGATTTCCAATAAGATTAGTAAATTTATTGATTGGCTCAAGG +AGAAAATTATACCAGCAGCTAGAGAGAAGGTCGAGTTTTTGAACAACCTAAAACAACTGCCATTGTTAGA +GAACCAAATCTCAAATCTGGAGCAGTCCGCCGCTTCGCAAGAAGACCTTGAGGCAATGTTTGGGAACGTA +TCGTATCTCGCTCACTTCTGCCGTAAATACCAACCACTTTATGCTACAGAAGCCAAAAGAGTTTATGCTT +TGGAAAAGAGGATGAACAATTACATGCAGTTCAAGAGCAAACACCGTATTGAACCTGTATGTCTTATCAT +CAGAGGCTCCCCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGAGAGATCCAATGGGTAAAGCCTAAC +AAAGAAACTGGGAGACTTAACATCAACGGGCCAACTCGCACCAAACTCGAACCTAGTGTGTTCCATGATA +TCTTTGAGGGCAACAAGGAACCAGCGGTCTTACACAGCAAAGACCCTCGCCTCGAGGTAGATTTTGAGCA +GGCATTGTTCTCCAAGTATGTAGGAAACACTATACATGAGCCTGATGAATATATCAAGGAGGCAGCCTTA +CATTATGCAAATCAGTTGAAGCAGCTACATATAGATACTTCTCAAATGAGCATGGAAGAGGCTTGCTACG +GTACAGACAATCTCGAGGCTATTGACCTTCACACTAGTGCAGGCTACCCCTACAGCGCCTTGGGGATCAA +GAAGAGGGATATTTTAGACCCCACCACCAGGGATGTGAGTAAGATGAAGTTCTACATGGACAAGTATGGT +CTTGACCTCCCTTACTCTACCTATGTTAAGGATGAGCTACGCTCAATAGATAAGATCAAGAAGGGGAAAT +CCCGCTTAATTGAAGCTAGCAGTTTGAATGACTCAGTTTACCTCAGAATGGCCTTCGGACATCTCTATGA +AACTTTCCATGCAAACCCTGGGACTGTGACTGGTTCGGCTGTGGGATGTAACCCAGACGTGTTCTGGAGC +AAGTTGCCAATCCTGCTCCCTGGTTCCCTCTTTGCTTTTGACTACTCGGGCTATGATGCTAGTCTCAGCC 
+CAGTTTGGTTTAGAGCATTGGAGCTAGTTCTTAGAGAGATAGGATACGGTAACGAGGCAGTCTCACTCAT +CGAAGGGATCAATCACACACACCATGTATATCGCAACAAAACTTATTGCGTACTTGGTGGGATGCCATCA +GGCTGTTCAGGAACATCCATCTTCAATTCAATGATTAACAACATCATCATTAGATCATTGCTTATCAAAA +CATTTAAGGGTATTGACCTGGATGAACTCAACATGGTTGCTTATGGGGACGATGTACTTGCTAGTTACCC +TTTTCCCATTGACTGCTCAGAACTAGCAAGAACGGGCAAGGAGTATGGTCTAACCATGACCCCTGCAGAT +AAGTCTCCTTGCTTCAATGAAGTTAATTGGGAAAATGCAACCTTTCTTAAGAGGGGTTTCTTGCCTGATG +AACAATTTCCATTTTTGATTCACCCCACCATGCCAATGAAGGAGATTCACGAATCCATTCGGTGGACTAA +GGATGCACGCAATACTCAAGATCACGTGCGATCCTTGTGTCTATTGGCGTGGCACAACGGCAAACAAGAA +TATGAAAAATTTGTAAGTGCAATTAGGTCTGTCCCAATAGGAAAGACACTGGCTATTCCAAATTATGAAA +ACCTGAGACGCAATTGGCTCGAATTATTC