# HG changeset patch # User artbio # Date 1507029557 14400 # Node ID f3b63b59a1ea63ce84570ac6cadc31aff303ffc8 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/blast_unmatched commit 5bd801feb838592fbb1f6dd68b5f1a480042da40 diff -r 000000000000 -r f3b63b59a1ea blast_unmatched.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/blast_unmatched.py Tue Oct 03 07:19:17 2017 -0400 @@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+
+import optparse
+
+
+def parse_options():
+    """
+    Parse the options given to the script.
+
+    Returns the optparse values object carrying ``fasta_file``,
+    ``blast_file`` and ``output_file``. Exits through ``parser.error``
+    when positional arguments are passed or when an option is missing.
+    """
+    parser = optparse.OptionParser(description='Get unmatched blast queries')
+    # Each help text is a single literal so that no space gets lost at a
+    # backslash line continuation inside the string.
+    parser.add_option('-f', '--fasta', dest='fasta_file',
+                      help='Query fasta file used during blast')
+    parser.add_option('-b', '--blast', dest='blast_file',
+                      help='Blast tabular output (queries in 1rst column)')
+    parser.add_option('-o', '--output', dest='output_file',
+                      help='Output file name')
+    (options, args) = parser.parse_args()
+    if len(args) > 0:
+        parser.error('Wrong number of arguments')
+    # Fail with a usage message instead of a TypeError from open(None).
+    for dest, flag in (('fasta_file', '--fasta'),
+                       ('blast_file', '--blast'),
+                       ('output_file', '--output')):
+        if getattr(options, dest) is None:
+            parser.error('Missing required option %s' % flag)
+    return options
+
+
+def get_matched(blast_file):
+    """
+    Get a dictionary of all the queries that got a match.
+
+    blast_file is the path of a tab-separated file whose first column is
+    the query id. Returns a dict mapping every query id found to 1.
+    """
+    matched = dict()
+    with open(blast_file, 'r') as blast_file_handle:
+        for line in blast_file_handle:
+            # Strip the line ending before splitting so that a line made
+            # of a single column is not stored with its trailing newline.
+            query_id = line.rstrip('\r\n').split('\t', 1)[0]
+            if query_id:
+                matched[query_id] = 1
+    return matched
+
+
+def get_unmatched(output_file, fasta_file, matched):
+    """
+    Compare matched queries to the query fasta file and write the
+    unmatched records (header and sequence lines) to output_file.
+
+    matched is the container of matched query ids built by get_matched.
+    """
+    with open(output_file, 'w') as output_file_handle, \
+            open(fasta_file, 'r') as fasta_file_handle:
+        unmatched = False
+        for line in fasta_file_handle:
+            if line.startswith('>'):
+                # Only the 99 characters following '>' are compared
+                # (original note: "qid are 100chars long in blast").
+                # NOTE(review): the whole header text is used, so this
+                # presumably expects headers without a description.
+                subline = line[1:100].rstrip()
+                unmatched = subline not in matched
+            if unmatched:
+                output_file_handle.write(line)
+
+
+def __main__():
+    """
+    Parse the options, collect the matched ids and write the others.
+    """
+    opts = parse_options()
+    matched = get_matched(opts.blast_file)
+    get_unmatched(opts.output_file, opts.fasta_file, matched)
+
+
+# Run only when executed as a script, not when the module is imported.
+if __name__ == '__main__':
+    __main__()
diff -r 000000000000 -r f3b63b59a1ea blast_unmatched.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/blast_unmatched.xml Tue Oct 03 07:19:17 2017 -0400 @@ -0,0 +1,35 @@ + + get query sequences that didn't get a match during a blast + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r f3b63b59a1ea test-data/test_blast.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_blast.tab Tue Oct 03 07:19:17 2017 -0400 @@ -0,0 +1,3 @@ +kp289641.1_coxsackievirus_a6_isolate_cv-a6/p395/2013/china_capsid_protein,_p2_protein,_and_rna-depe kr815992.1_coxsackievirus_a6_isolate_12743/gz/chn/2013,_complete_genome 96.18 1386 53 0 2354 3739 5952 7337 0.0 2260 +kp289641.1_coxsackievirus_a6_isolate_cv-a6/p395/2013/china_capsid_protein,_p2_protein,_and_rna-depe kr815992.1_coxsackievirus_a6_isolate_12743/gz/chn/2013,_complete_genome 96.94 915 28 0 501 1415 2430 3344 0.0 1525 +kp289641.1_coxsackievirus_a6_isolate_cv-a6/p395/2013/china_capsid_protein,_p2_protein,_and_rna-depe kr815992.1_coxsackievirus_a6_isolate_12743/gz/chn/2013,_complete_genome 95.25 737 35 0 1516 2252 3747 4483 0.0 1171 diff -r 000000000000 -r f3b63b59a1ea test-data/test_output.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_output.fa Tue Oct 03 07:19:17 2017 -0400 @@ -0,0 +1,7 @@ +>kc879361.1_coxsackievirus_a16_strain_34351_cva16_rus_2009_2c_protein_gene,_partial_cds +CCACTCTACGCGACTGAAGCCAAAAGAGTTTATGCTCTAGAGAAAAGGATGAACAACTACATGCAGTTCA +AGAGCAAACACCGTATTGAACCTGTATGTCTGATCATTAGAGGCTCTCCAGGAACAGGTAAGTCACTTGC +TACGGGCATCATAGCTAGAGCTATTGCAGACAAATATCATTCTAGTGTTTATTCACTCCCTCCAGACCCA +GACCATTTTGATGGGTACAAGCAACAAGTAGTCACTGTTATGGATGACCTCTGCCAAAATCCAGATGGGA +AGGACATGTCTCTATTTTGTCAAATGGTTTCTACAGTCGACTTTATACCACCCATGGCATCACTGGAAGA +GAAAGGAGTGTCTTTCACCTCTAAGTTCGTCATAGCATCAACCAATGCTAGTAATATTGTAGTTCCT diff -r
000000000000 -r f3b63b59a1ea test-data/test_query.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_query.fa Tue Oct 03 07:19:17 2017 -0400 @@ -0,0 +1,62 @@ +>kc879361.1_coxsackievirus_a16_strain_34351_cva16_rus_2009_2c_protein_gene,_partial_cds +CCACTCTACGCGACTGAAGCCAAAAGAGTTTATGCTCTAGAGAAAAGGATGAACAACTACATGCAGTTCA +AGAGCAAACACCGTATTGAACCTGTATGTCTGATCATTAGAGGCTCTCCAGGAACAGGTAAGTCACTTGC +TACGGGCATCATAGCTAGAGCTATTGCAGACAAATATCATTCTAGTGTTTATTCACTCCCTCCAGACCCA +GACCATTTTGATGGGTACAAGCAACAAGTAGTCACTGTTATGGATGACCTCTGCCAAAATCCAGATGGGA +AGGACATGTCTCTATTTTGTCAAATGGTTTCTACAGTCGACTTTATACCACCCATGGCATCACTGGAAGA +GAAAGGAGTGTCTTTCACCTCTAAGTTCGTCATAGCATCAACCAATGCTAGTAATATTGTAGTTCCT +>kp289641.1_coxsackievirus_a6_isolate_cv-a6/p395/2013/china_capsid_protein,_p2_protein,_and_rna-depe +GGCGCGCCAGTCATGTCTAGATCAAGCACTTCTGTCTCCCCGGATCGAGTATCAATAGACTGCTAGCGCG +GTTGAAGGAGAAAACGTCCGTTACCCGGCTAACTACTTCGAGAAACTTAGTAGCACCATTGAAGCTGCGG +AGTGTTTCGCTCAGCACTCCCCCAGTGTAGATCAGGTCGATGAGTCACTGCACTCCCCACGGGCGACCGT +GGCAGTGACTGCGTTGGCGGCCTGCCTATGGGGCAACCCATAGGACGCTCTAAAGTGGACATGGTGCGAA +GAGTCTATTGAGCTAGTTAGTAGTCCTCCGGCCCCTGAATGCGGCTAATCCTAACTGCGGAGCACATGCC +CTCAATCCAGGGGGTGGTGTGTCGTAACGGGCAACTCTGCAGCGGAACCGNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNAATGATCCCATTACAAATGCAGTGGAAAGCGCTGTGAGCGCGCTTGCCGACACCACAATA +TCCCGGGTGACCGCAGCCAACACTGCAGCTAGCACCCACTCCCTGGGAACAGGGCGTGTACCGGCATTGC +AAGCCGCAGAAACGGGAGCAAGCTCTAATGCTAGTGATGAGAACCTTATTGAGACTCGCTGTGTGATGAA +TCGAAACGGGGTTAATGAGGCGAGTGTGGAACACTTTTACTCTCGTGCAGGGCTGGTAGGAGTTGTGGAG +GTGAAGGACTCGGGCACTAGCCTGGATGGGTACACAGTTTGGCCCATAGATGTGATGGGCTTCGTGCAAC +AGCGGCGCAAGCTAGAGCTGTCAACATACATGCGCTTTGATGCCGAGTTCACTTTTGTGTCCAACCTCAA +TGACAGCACGACGCCCGGGATGCTGCTGCAGTATATGTATGTACCACCAGGGGCCCCTAAGCCGGATAGC +AGGAAATCATATCAATGGCAGACTGCTACTAACCCGTCGGTATTCGCAAAATTGAGTGATCCACCCCCCC +AGGTATCTGTCCCGTTCATGTCGCCAGCAACAGCTTATCAGTGGTTTTATGATGGTTACCCTACATTTGG 
+TGAGCACAAACAAGCTACCAATTTGCAATATGGGCAGTGTCCTAATAACATGATGGGCCATTTTGCCATC +CGAACAGTCAGTGAATCTACCACCGGGAAAAACGTCCACGTTCGGGTGTACATGAGAATTAAGCACGTGA +GAGCTTGGGTACCTAGACCCCTTCGATCCCAAGCTTATATGGTCAAGAACTACCCGACATACAGCCAAAC +AATAACTAACACTGCAACTGACCGTGCAAGTATAACCACCACGGATTATGAAGGCGGGGTACCAGCAAAC +CCACAAAGGACATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCAGATGTCAGAGATCTTTTGTGGC +TGGATGAAGAAGCTATGGAACAGGGTGTGTCAGATTACATCAAAGGGCTCGGTGACGCATTCGGAACTGG +TTTTACTGATGCAGTGTCTAGGGAGGTGGAAGCTCTTAAGAACTACCTTATAGGATCTGAAGGGGCTGTT +GAAAAGATCTTGAAGAATTTAATTAAATTGATCTCAGCATTAGTCATAGTGACCAGAAGTGATTATGACA +TGGTAACCCTTACAGCAACCTTGGCACTCATAGGGTGTCATGGCAGCCCCTGGGCGTGGATCAAGGCTAA +GACAGCATCCATCTTAGGCATCCCTATCGCCCAGAAGCAGAGTGCGTCATGGCTCAAGAAGTTTAACGAC +ATGGCCAATGCTGCCAAGGGATTTGAGTGGATTTCCAATAAGATTAGTAAATTTATTGATTGGCTCAAGG +AGAAAATTATACCAGCAGCTAGAGAGAAGGTCGAGTTTTTGAACAACCTAAAACAACTGCCATTGTTAGA +GAACCAAATCTCAAATCTGGAGCAGTCCGCCGCTTCGCAAGAAGACCTTGAGGCAATGTTTGGGAACGTA +TCGTATCTCGCTCACTTCTGCCGTAAATACCAACCACTTTATGCTACAGAAGCCAAAAGAGTTTATGCTT +TGGAAAAGAGGATGAACAATTACATGCAGTTCAAGAGCAAACACCGTATTGAACCTGTATGTCTTATCAT +CAGAGGCTCCCCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGAGAGATCCAATGGGTAAAGCCTAAC +AAAGAAACTGGGAGACTTAACATCAACGGGCCAACTCGCACCAAACTCGAACCTAGTGTGTTCCATGATA +TCTTTGAGGGCAACAAGGAACCAGCGGTCTTACACAGCAAAGACCCTCGCCTCGAGGTAGATTTTGAGCA +GGCATTGTTCTCCAAGTATGTAGGAAACACTATACATGAGCCTGATGAATATATCAAGGAGGCAGCCTTA +CATTATGCAAATCAGTTGAAGCAGCTACATATAGATACTTCTCAAATGAGCATGGAAGAGGCTTGCTACG +GTACAGACAATCTCGAGGCTATTGACCTTCACACTAGTGCAGGCTACCCCTACAGCGCCTTGGGGATCAA +GAAGAGGGATATTTTAGACCCCACCACCAGGGATGTGAGTAAGATGAAGTTCTACATGGACAAGTATGGT +CTTGACCTCCCTTACTCTACCTATGTTAAGGATGAGCTACGCTCAATAGATAAGATCAAGAAGGGGAAAT +CCCGCTTAATTGAAGCTAGCAGTTTGAATGACTCAGTTTACCTCAGAATGGCCTTCGGACATCTCTATGA +AACTTTCCATGCAAACCCTGGGACTGTGACTGGTTCGGCTGTGGGATGTAACCCAGACGTGTTCTGGAGC 
+AAGTTGCCAATCCTGCTCCCTGGTTCCCTCTTTGCTTTTGACTACTCGGGCTATGATGCTAGTCTCAGCC +CAGTTTGGTTTAGAGCATTGGAGCTAGTTCTTAGAGAGATAGGATACGGTAACGAGGCAGTCTCACTCAT +CGAAGGGATCAATCACACACACCATGTATATCGCAACAAAACTTATTGCGTACTTGGTGGGATGCCATCA +GGCTGTTCAGGAACATCCATCTTCAATTCAATGATTAACAACATCATCATTAGATCATTGCTTATCAAAA +CATTTAAGGGTATTGACCTGGATGAACTCAACATGGTTGCTTATGGGGACGATGTACTTGCTAGTTACCC +TTTTCCCATTGACTGCTCAGAACTAGCAAGAACGGGCAAGGAGTATGGTCTAACCATGACCCCTGCAGAT +AAGTCTCCTTGCTTCAATGAAGTTAATTGGGAAAATGCAACCTTTCTTAAGAGGGGTTTCTTGCCTGATG +AACAATTTCCATTTTTGATTCACCCCACCATGCCAATGAAGGAGATTCACGAATCCATTCGGTGGACTAA +GGATGCACGCAATACTCAAGATCACGTGCGATCCTTGTGTCTATTGGCGTGGCACAACGGCAAACAAGAA +TATGAAAAATTTGTAAGTGCAATTAGGTCTGTCCCAATAGGAAAGACACTGGCTATTCCAAATTATGAAA +ACCTGAGACGCAATTGGCTCGAATTATTC