changeset 0:f3b63b59a1ea draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/blast_unmatched commit 5bd801feb838592fbb1f6dd68b5f1a480042da40
author artbio
date Tue, 03 Oct 2017 07:19:17 -0400
parents
children 50c1fa95a076
files blast_unmatched.py blast_unmatched.xml test-data/test_blast.tab test-data/test_output.fa test-data/test_query.fa
diffstat 5 files changed, 167 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/blast_unmatched.py	Tue Oct 03 07:19:17 2017 -0400
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+
+import optparse
+
+
+def parse_options():
+    """Parse the options given to the script and return the optparse values."""
+    parser = optparse.OptionParser(description='Get unmatched blast queries')
+    # no backslash continuation inside help strings: it would glue two words
+    parser.add_option('-f', '--fasta', dest='fasta_file',
+                      help='Query fasta file used during blast')
+    parser.add_option('-b', '--blast', dest='blast_file',
+                      help='Blast tabular output (queries in 1rst column)')
+    parser.add_option('-o', '--output', dest='output_file',
+                      help='Output file name')
+    (options, args) = parser.parse_args()
+    if args:  # the script takes options only, no positional argument
+        parser.error('Wrong number of arguments')
+    return options
+
+def get_matched(blast_file):
+    """
+    Get a dictionary of all the queries that got a match
+
+    blast_file is the path of a tab-separated blast output; the keys of the
+    returned dict are its distinct first-column values, each mapped to 1.
+    """
+    matched = dict()
+    with open(blast_file, 'r') as blast_file_handle:  # closed even on error
+        for line in blast_file_handle:  # streamed instead of readlines()
+            matched[line.split("\t")[0]] = 1
+    return matched
+
+def get_unmatched(output_file, fasta_file, matched):
+    """
+    Compares matched queries to query fasta file and print unmatched to output
+
+    output_file: path of the fasta file to write
+    fasta_file: path of the fasta file used as blast query
+    matched: container of the query ids that got a match
+
+    A record is written when its header id (text after '>') is not in matched.
+    """
+    with open(output_file, 'w') as output_file_handle, \
+            open(fasta_file, 'r') as fasta_file_handle:
+        unmatched = False  # sequence lines before any header are dropped
+        for line in fasta_file_handle:
+            if line.startswith('>'):
+                # blast truncates qids; [1:100] keeps 99 chars (TODO confirm)
+                unmatched = line[1:100].rstrip() not in matched
+            if unmatched:
+                output_file_handle.write(line)
+
+def __main__():
+    """Write the fasta records of the query file that are not in the blast."""
+    cli = parse_options()
+    get_unmatched(cli.output_file, cli.fasta_file, get_matched(cli.blast_file))
+
+if __name__ == "__main__":  # run only as a script, never on import
+    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/blast_unmatched.xml	Tue Oct 03 07:19:17 2017 -0400
@@ -0,0 +1,35 @@
+<tool id="blast_unmatched" name="Blast Unmatched" version="0.1.0">
+    <description>get query sequences that didn't get a match during a blast</description>
+    <requirements>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        python '$__tool_directory__/blast_unmatched.py'
+            --fasta '$fasta_file'
+            --blast '$blast_file'
+            --output '$output_file'
+    ]]></command>
+    <inputs>
+        <param type="data" name="fasta_file" format="fasta"/>
+        <param type="data" name="blast_file" format="tabular"/>
+    </inputs>
+    <outputs>
+        <data name="output_file" format="fasta" label="Unmatched queries from blast: ${blast_file.name}"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="blast_file" value="test_blast.tab"/>
+            <param name="fasta_file" value="test_query.fa"/>
+            <output name="output_file" ftype="fasta" file="test_output.fa"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+This tool takes a `blast`_ output in tabular format (with the query id in the first column) and the fasta file used as query. It then returns the query sequences that remained unmatched during the blast.
+
+.. _blast: https://blast.ncbi.nlm.nih.gov/Blast.cgi
+
+    ]]></help>
+    <citations>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_blast.tab	Tue Oct 03 07:19:17 2017 -0400
@@ -0,0 +1,3 @@
+kp289641.1_coxsackievirus_a6_isolate_cv-a6/p395/2013/china_capsid_protein,_p2_protein,_and_rna-depe	kr815992.1_coxsackievirus_a6_isolate_12743/gz/chn/2013,_complete_genome	96.18	1386	53	0	2354	3739	5952	7337	0.0	2260
+kp289641.1_coxsackievirus_a6_isolate_cv-a6/p395/2013/china_capsid_protein,_p2_protein,_and_rna-depe	kr815992.1_coxsackievirus_a6_isolate_12743/gz/chn/2013,_complete_genome	96.94	915	28	0	501	1415	2430	3344	0.0	1525
+kp289641.1_coxsackievirus_a6_isolate_cv-a6/p395/2013/china_capsid_protein,_p2_protein,_and_rna-depe	kr815992.1_coxsackievirus_a6_isolate_12743/gz/chn/2013,_complete_genome	95.25	737	35	0	1516	2252	3747	4483	0.0	1171
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_output.fa	Tue Oct 03 07:19:17 2017 -0400
@@ -0,0 +1,7 @@
+>kc879361.1_coxsackievirus_a16_strain_34351_cva16_rus_2009_2c_protein_gene,_partial_cds
+CCACTCTACGCGACTGAAGCCAAAAGAGTTTATGCTCTAGAGAAAAGGATGAACAACTACATGCAGTTCA
+AGAGCAAACACCGTATTGAACCTGTATGTCTGATCATTAGAGGCTCTCCAGGAACAGGTAAGTCACTTGC
+TACGGGCATCATAGCTAGAGCTATTGCAGACAAATATCATTCTAGTGTTTATTCACTCCCTCCAGACCCA
+GACCATTTTGATGGGTACAAGCAACAAGTAGTCACTGTTATGGATGACCTCTGCCAAAATCCAGATGGGA
+AGGACATGTCTCTATTTTGTCAAATGGTTTCTACAGTCGACTTTATACCACCCATGGCATCACTGGAAGA
+GAAAGGAGTGTCTTTCACCTCTAAGTTCGTCATAGCATCAACCAATGCTAGTAATATTGTAGTTCCT
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_query.fa	Tue Oct 03 07:19:17 2017 -0400
@@ -0,0 +1,62 @@
+>kc879361.1_coxsackievirus_a16_strain_34351_cva16_rus_2009_2c_protein_gene,_partial_cds
+CCACTCTACGCGACTGAAGCCAAAAGAGTTTATGCTCTAGAGAAAAGGATGAACAACTACATGCAGTTCA
+AGAGCAAACACCGTATTGAACCTGTATGTCTGATCATTAGAGGCTCTCCAGGAACAGGTAAGTCACTTGC
+TACGGGCATCATAGCTAGAGCTATTGCAGACAAATATCATTCTAGTGTTTATTCACTCCCTCCAGACCCA
+GACCATTTTGATGGGTACAAGCAACAAGTAGTCACTGTTATGGATGACCTCTGCCAAAATCCAGATGGGA
+AGGACATGTCTCTATTTTGTCAAATGGTTTCTACAGTCGACTTTATACCACCCATGGCATCACTGGAAGA
+GAAAGGAGTGTCTTTCACCTCTAAGTTCGTCATAGCATCAACCAATGCTAGTAATATTGTAGTTCCT
+>kp289641.1_coxsackievirus_a6_isolate_cv-a6/p395/2013/china_capsid_protein,_p2_protein,_and_rna-depe
+GGCGCGCCAGTCATGTCTAGATCAAGCACTTCTGTCTCCCCGGATCGAGTATCAATAGACTGCTAGCGCG
+GTTGAAGGAGAAAACGTCCGTTACCCGGCTAACTACTTCGAGAAACTTAGTAGCACCATTGAAGCTGCGG
+AGTGTTTCGCTCAGCACTCCCCCAGTGTAGATCAGGTCGATGAGTCACTGCACTCCCCACGGGCGACCGT
+GGCAGTGACTGCGTTGGCGGCCTGCCTATGGGGCAACCCATAGGACGCTCTAAAGTGGACATGGTGCGAA
+GAGTCTATTGAGCTAGTTAGTAGTCCTCCGGCCCCTGAATGCGGCTAATCCTAACTGCGGAGCACATGCC
+CTCAATCCAGGGGGTGGTGTGTCGTAACGGGCAACTCTGCAGCGGAACCGNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNAATGATCCCATTACAAATGCAGTGGAAAGCGCTGTGAGCGCGCTTGCCGACACCACAATA
+TCCCGGGTGACCGCAGCCAACACTGCAGCTAGCACCCACTCCCTGGGAACAGGGCGTGTACCGGCATTGC
+AAGCCGCAGAAACGGGAGCAAGCTCTAATGCTAGTGATGAGAACCTTATTGAGACTCGCTGTGTGATGAA
+TCGAAACGGGGTTAATGAGGCGAGTGTGGAACACTTTTACTCTCGTGCAGGGCTGGTAGGAGTTGTGGAG
+GTGAAGGACTCGGGCACTAGCCTGGATGGGTACACAGTTTGGCCCATAGATGTGATGGGCTTCGTGCAAC
+AGCGGCGCAAGCTAGAGCTGTCAACATACATGCGCTTTGATGCCGAGTTCACTTTTGTGTCCAACCTCAA
+TGACAGCACGACGCCCGGGATGCTGCTGCAGTATATGTATGTACCACCAGGGGCCCCTAAGCCGGATAGC
+AGGAAATCATATCAATGGCAGACTGCTACTAACCCGTCGGTATTCGCAAAATTGAGTGATCCACCCCCCC
+AGGTATCTGTCCCGTTCATGTCGCCAGCAACAGCTTATCAGTGGTTTTATGATGGTTACCCTACATTTGG
+TGAGCACAAACAAGCTACCAATTTGCAATATGGGCAGTGTCCTAATAACATGATGGGCCATTTTGCCATC
+CGAACAGTCAGTGAATCTACCACCGGGAAAAACGTCCACGTTCGGGTGTACATGAGAATTAAGCACGTGA
+GAGCTTGGGTACCTAGACCCCTTCGATCCCAAGCTTATATGGTCAAGAACTACCCGACATACAGCCAAAC
+AATAACTAACACTGCAACTGACCGTGCAAGTATAACCACCACGGATTATGAAGGCGGGGTACCAGCAAAC
+CCACAAAGGACATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCAGATGTCAGAGATCTTTTGTGGC
+TGGATGAAGAAGCTATGGAACAGGGTGTGTCAGATTACATCAAAGGGCTCGGTGACGCATTCGGAACTGG
+TTTTACTGATGCAGTGTCTAGGGAGGTGGAAGCTCTTAAGAACTACCTTATAGGATCTGAAGGGGCTGTT
+GAAAAGATCTTGAAGAATTTAATTAAATTGATCTCAGCATTAGTCATAGTGACCAGAAGTGATTATGACA
+TGGTAACCCTTACAGCAACCTTGGCACTCATAGGGTGTCATGGCAGCCCCTGGGCGTGGATCAAGGCTAA
+GACAGCATCCATCTTAGGCATCCCTATCGCCCAGAAGCAGAGTGCGTCATGGCTCAAGAAGTTTAACGAC
+ATGGCCAATGCTGCCAAGGGATTTGAGTGGATTTCCAATAAGATTAGTAAATTTATTGATTGGCTCAAGG
+AGAAAATTATACCAGCAGCTAGAGAGAAGGTCGAGTTTTTGAACAACCTAAAACAACTGCCATTGTTAGA
+GAACCAAATCTCAAATCTGGAGCAGTCCGCCGCTTCGCAAGAAGACCTTGAGGCAATGTTTGGGAACGTA
+TCGTATCTCGCTCACTTCTGCCGTAAATACCAACCACTTTATGCTACAGAAGCCAAAAGAGTTTATGCTT
+TGGAAAAGAGGATGAACAATTACATGCAGTTCAAGAGCAAACACCGTATTGAACCTGTATGTCTTATCAT
+CAGAGGCTCCCCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGAGAGATCCAATGGGTAAAGCCTAAC
+AAAGAAACTGGGAGACTTAACATCAACGGGCCAACTCGCACCAAACTCGAACCTAGTGTGTTCCATGATA
+TCTTTGAGGGCAACAAGGAACCAGCGGTCTTACACAGCAAAGACCCTCGCCTCGAGGTAGATTTTGAGCA
+GGCATTGTTCTCCAAGTATGTAGGAAACACTATACATGAGCCTGATGAATATATCAAGGAGGCAGCCTTA
+CATTATGCAAATCAGTTGAAGCAGCTACATATAGATACTTCTCAAATGAGCATGGAAGAGGCTTGCTACG
+GTACAGACAATCTCGAGGCTATTGACCTTCACACTAGTGCAGGCTACCCCTACAGCGCCTTGGGGATCAA
+GAAGAGGGATATTTTAGACCCCACCACCAGGGATGTGAGTAAGATGAAGTTCTACATGGACAAGTATGGT
+CTTGACCTCCCTTACTCTACCTATGTTAAGGATGAGCTACGCTCAATAGATAAGATCAAGAAGGGGAAAT
+CCCGCTTAATTGAAGCTAGCAGTTTGAATGACTCAGTTTACCTCAGAATGGCCTTCGGACATCTCTATGA
+AACTTTCCATGCAAACCCTGGGACTGTGACTGGTTCGGCTGTGGGATGTAACCCAGACGTGTTCTGGAGC
+AAGTTGCCAATCCTGCTCCCTGGTTCCCTCTTTGCTTTTGACTACTCGGGCTATGATGCTAGTCTCAGCC
+CAGTTTGGTTTAGAGCATTGGAGCTAGTTCTTAGAGAGATAGGATACGGTAACGAGGCAGTCTCACTCAT
+CGAAGGGATCAATCACACACACCATGTATATCGCAACAAAACTTATTGCGTACTTGGTGGGATGCCATCA
+GGCTGTTCAGGAACATCCATCTTCAATTCAATGATTAACAACATCATCATTAGATCATTGCTTATCAAAA
+CATTTAAGGGTATTGACCTGGATGAACTCAACATGGTTGCTTATGGGGACGATGTACTTGCTAGTTACCC
+TTTTCCCATTGACTGCTCAGAACTAGCAAGAACGGGCAAGGAGTATGGTCTAACCATGACCCCTGCAGAT
+AAGTCTCCTTGCTTCAATGAAGTTAATTGGGAAAATGCAACCTTTCTTAAGAGGGGTTTCTTGCCTGATG
+AACAATTTCCATTTTTGATTCACCCCACCATGCCAATGAAGGAGATTCACGAATCCATTCGGTGGACTAA
+GGATGCACGCAATACTCAAGATCACGTGCGATCCTTGTGTCTATTGGCGTGGCACAACGGCAAACAAGAA
+TATGAAAAATTTGTAAGTGCAATTAGGTCTGTCCCAATAGGAAAGACACTGGCTATTCCAAATTATGAAA
+ACCTGAGACGCAATTGGCTCGAATTATTC