Repository 'blast_parser'
hg clone https://toolshed.g2.bx.psu.edu/repos/earlhaminst/blast_parser

Changeset 3:70df762b48a8 (2017-10-03)
Previous changeset 2:376ed15e0d27 (2017-03-24) Next changeset 4:363f3480622d (2017-10-12)
Commit message:
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 32272744ad83a704fd427b48aae574496a279901-dirty
modified:
blast_parser.xml
test-data/output.tabular
added:
blast_parser.py
test-data/output2.tabular
removed:
blast_parser.pl
b
diff -r 376ed15e0d27 -r 70df762b48a8 blast_parser.pl
--- a/blast_parser.pl Fri Mar 24 12:14:47 2017 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,43 +0,0 @@
-#!/usr/bin/perl
-use strict;
-use warnings;
-use List::Util qw(min max);
-
-# A simple Perl parser to convert a BLAST 12-column or 24-column output into a
-# 3-column input for hcluster_hg (id1, id2, weight):
-# parse_blast.pl <file>
-
-use constant LOG_E_10 => log(10);
-
-my $file1 = $ARGV[0];
-open my $fh1, '<', $file1;
-
-while (my $line = <$fh1>) {
-    my @row = split(/\t/, $line);
-
-    if ($row[0] eq $row[1]) {
-        # ignore self matching hits
-    } else {
-        # Convert evalue to an integer weight with max 100
-        my $weight = 100;
-
-        #if the evalue is 0, leave weight at 100
-        if ($row[10] != 0 && $row[10] != 0.0) {
-            $weight = min(100, positive_round(-1 * log10($row[10])));
-        }
-        print"$row[0]\t$row[1]\t$weight\n";
-    }
-}
-close $fh1;
-
-# Calculate logarithm to base 10 of a number
-sub log10 {
-    my $n = shift;
-    return log($n) / LOG_E_10;
-}
-
-# Round a positive float to the nearest integer
-sub positive_round{
-    my $n = shift;
-    return int($n + 0.5);
-}
b
diff -r 376ed15e0d27 -r 70df762b48a8 blast_parser.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/blast_parser.py Tue Oct 03 04:51:45 2017 -0400
[
@@ -0,0 +1,54 @@
+"""
+Simple parser to convert a BLAST 12-column or 24-column tabular output into a
+3-column tabular input for hcluster_sg (id1, id2, weight).
+"""
+import argparse
+import math
+from collections import OrderedDict
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('-i', metavar='in-file', type=argparse.FileType('rt'), required=True, help='Path to input file')
+
+    parser.add_argument('-o', metavar='out-file', type=argparse.FileType('wt'), required=True, help='Path to output file')
+
+    parser.add_argument('-r', action='store_true', default=False,
+                        dest='reciprocal',
+                        help='Annotate homolog pair')
+
+    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
+
+    options = parser.parse_args()
+
+    results = OrderedDict()
+
+    for line in options.i:
+        line = line.rstrip()
+        line_cols = line.split('\t')
+        sequence1_id = line_cols[0]
+        sequence2_id = line_cols[1]
+        evalue = float(line_cols[10])
+
+        # Ignore self-matching hits
+        if sequence1_id != sequence2_id:
+            # Convert evalue to an integer weight with max 100
+            weight = 100
+
+            # If the evalue is 0, leave weight at 100
+            if evalue != 0.0:
+                weight = min(100, round(math.log10(evalue) / -2.0))
+
+            if (sequence1_id, sequence2_id) not in results:
+                results[(sequence1_id, sequence2_id)] = weight
+            else:
+                results[(sequence1_id, sequence2_id)] = max(results[(sequence1_id, sequence2_id)], weight)
+
+    for (sequence1_id, sequence2_id), weight in results.items():
+        if not options.reciprocal or (sequence2_id, sequence1_id) in results:
+            options.o.write("%s\t%s\t%d\n" % (sequence1_id, sequence2_id, weight))
+
+
+if __name__ == "__main__":
+    main()
b
diff -r 376ed15e0d27 -r 70df762b48a8 blast_parser.xml
--- a/blast_parser.xml Fri Mar 24 12:14:47 2017 -0400
+++ b/blast_parser.xml Tue Oct 03 04:51:45 2017 -0400
[
@@ -1,18 +1,22 @@
-<tool id="blast_parser" name="BLAST parser" version="0.1.1">
+<tool id="blast_parser" name="BLAST parser" version="0.1.2">
     <description>
         Convert 12- or 24-column BLAST output into 3-column hcluster_sg input
     </description>
 
     <command detect_errors="exit_code">
 <![CDATA[
-perl '$__tool_directory__/blast_parser.pl'
-'$input'
-> '$output'
+python '$__tool_directory__/blast_parser.py'
+-i '$input'
+-o '$output'
+#if $reciprocal
+    -r
+#end if
 ]]>
     </command>
 
     <inputs>
         <param name="input" type="data" format="tabular" label="Tabular data" help="BLAST 12 column tabular format data"/>
+        <param name="reciprocal" type="boolean" checked="false" label="Reciprocal results" help="returns only reciprocal results"/>
     </inputs>
 
     <outputs>
@@ -22,12 +26,18 @@
     <tests>
         <test>
             <param name="input" ftype="tabular" value="input.tabular" />
+            <param name="reciprocal" value="false" />
             <output name="output" file="output.tabular" />
         </test>
+        <test>
+            <param name="input" ftype="tabular" value="input.tabular" />
+            <param name="reciprocal" value="true" />
+            <output name="output" file="output2.tabular" />
+        </test>
     </tests>
     <help>
 <![CDATA[
-Simple tool to convert a 12- or 24-column BLAST output into a 3-column format (qseqid, sseqid, round(-1 * log10(evalue))) usable as input for the hcluster_sg tool.
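+Simple tool to convert a 12- or 24-column BLAST output into a 3-column format (qseqid, sseqid, weight) usable as input for the hcluster_sg tool, where weight = min(100, round(-1 * log10(evalue) / 2)) and an e-value of 0 gives weight 100. Self-matching hits are dropped and, for repeated pairs, only the highest weight is kept.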
 ]]>
     </help>
     <citations>
b
diff -r 376ed15e0d27 -r 70df762b48a8 test-data/output.tabular
--- a/test-data/output.tabular Fri Mar 24 12:14:47 2017 -0400
+++ b/test-data/output.tabular Tue Oct 03 04:51:45 2017 -0400
b
@@ -4,24 +4,22 @@
 ENSPCAT00000008534_procaviacapensis_1 ENST00000378069_homosapiens_1 100
 ENSLOCT00000017020_lepisosteusoculatus_1 ENSLACT00000026689_latimeriachalumnae_1 100
 ENSCPOT00000000986_caviaporcellus_1 ENSCAFT00000022963_canisfamiliaris_1 100
-ENSTGUT00000016508_taeniopygiaguttata_1 ENSTGUT00000006603_taeniopygiaguttata_1 100
+ENSTGUT00000016508_taeniopygiaguttata_1 ENSTGUT00000006603_taeniopygiaguttata_1 79
 ENSPFOT00000010657_poeciliaformosa_1 ENSXMAT00000001796_xiphophorusmaculatus_1 100
 ENSDNOT00000016434_dasypusnovemcinctus_1 ENSDNOT00000036768_dasypusnovemcinctus_1 100
-ENSPMAT00000010398_petromyzonmarinus_1 ENSLACT00000015911_latimeriachalumnae_1 64
+ENSPMAT00000010398_petromyzonmarinus_1 ENSLACT00000015911_latimeriachalumnae_1 32
 ENSAMET00000018099_ailuropodamelanoleuca_1 ENSCAFT00000022939_canisfamiliaris_1 100
 ENSEEUT00000005606_erinaceuseuropaeus_1 ENSMPUT00000012759_mustelaputoriusfuro_1 100
 ENSSHAT00000006757_sarcophilusharrisii_1 ENSMODT00000026841_monodelphisdomestica_1 100
 ENSPSIT00000017454_pelodiscussinensis_1 ENSPSIT00000017443_pelodiscussinensis_1 100
 ENSPFOT00000022544_poeciliaformosa_1 ENSXMAT00000001796_xiphophorusmaculatus_1 100
 ENSMICT00000002052_microcebusmurinus_1 ENSCAFT00000022963_canisfamiliaris_1 100
-ENSMICT00000002052_microcebusmurinus_1 ENSCAFT00000022963_canisfamiliaris_1 46
 ENSRNOT00000066674_rattusnorvegicus_1 ENSMUST00000026013_musmusculus_1 100
 ENSFCAT00000013090_feliscatus_1 ENSAMET00000018029_ailuropodamelanoleuca_1 100
 ENSONIT00000020514_oreochromisniloticus_1 ENSPFOT00000009022_poeciliaformosa_1 100
 ENSLACT00000026572_latimeriachalumnae_1 ENSLACT00000015911_latimeriachalumnae_1 100
-ENSPMAT00000003449_petromyzonmarinus_1 ENSGGOT00000000206_gorillagorilla_1 100
+ENSPMAT00000003449_petromyzonmarinus_1 ENSGGOT00000000206_gorillagorilla_1 75
 ENSSART00000014230_sorexaraneus_1 ENSSTOT00000004965_ictidomystridecemlineatus_1 100
-ENSSART00000014230_sorexaraneus_1 ENSSTOT00000004965_ictidomystridecemlineatus_1 44
 ENSBTAT00000001698_bostaurus_1 ENSCAFT00000022963_canisfamiliaris_1 100
 ENSTBET00000006983_tupaiabelangeri_1 ENSAMET00000018099_ailuropodamelanoleuca_1 100
 ENSLACT00000014274_latimeriachalumnae_1 ENSLACT00000026689_latimeriachalumnae_1 100
@@ -38,8 +36,7 @@
 ENSXMAT00000001796_xiphophorusmaculatus_1 ENSONIT00000016435_oreochromisniloticus_1 100
 ENSSSCT00000013404_susscrofa_1 ENSPPYT00000023637_pongoabelii_1 100
 ENSGALT00000036672_gallusgallus_1 ENSMGAT00000016429_meleagrisgallopavo_1 100
-ENSOPRT00000017156_ochotonaprinceps_1 ENSOCUT00000001438_oryctolaguscuniculus_1 100
-ENSOPRT00000017156_ochotonaprinceps_1 ENSOCUT00000001438_oryctolaguscuniculus_1 17
+ENSOPRT00000017156_ochotonaprinceps_1 ENSOCUT00000001438_oryctolaguscuniculus_1 70
 ENSSTOT00000004988_ictidomystridecemlineatus_1 ENSPANT00000027606_papioanubis_1 100
 ENSECAT00000024641_equuscaballus_1 ENSCAFT00000022939_canisfamiliaris_1 100
 ENSAPLT00000013855_anasplatyrhynchos_1 ENSMGAT00000016431_meleagrisgallopavo_1 100
@@ -48,7 +45,7 @@
 ENSPTRT00000040521_pantroglodytes_1 ENSGGOT00000000206_gorillagorilla_1 100
 ENSPTRT00000040520_pantroglodytes_1 ENSPPYT00000023637_pongoabelii_1 100
 ENSMEUT00000003745_macropuseugenii_1 ENSMODT00000026841_monodelphisdomestica_1 100
-ENSMICT00000002042_microcebusmurinus_1 ENSDNOT00000039756_dasypusnovemcinctus_1 100
+ENSMICT00000002042_microcebusmurinus_1 ENSDNOT00000039756_dasypusnovemcinctus_1 85
 ENSXETT00000010517_xenopustropicalis_1 ENSXETT00000010521_xenopustropicalis_1 100
 ENSMODT00000026840_monodelphisdomestica_1 ENSCAFT00000022963_canisfamiliaris_1 100
 ENSMLUT00000001428_myotislucifugus_1 ENSAMET00000018099_ailuropodamelanoleuca_1 100
@@ -59,8 +56,8 @@
 ENSTRUT00000035430_takifugurubripes_1 ENSTNIT00000014720_tetraodonnigroviridis_1 100
 ENSMEUT00000006183_macropuseugenii_1 ENSMODT00000026840_monodelphisdomestica_1 100
 ENSFALT00000001591_ficedulaalbicollis_1 ENSTGUT00000006603_taeniopygiaguttata_1 100
-ENSMUST00000168613_musmusculus_1 ENSMUST00000040820_musmusculus_1 100
-ENSSSCT00000032764_susscrofa_1 ENSSSCT00000013404_susscrofa_1 100
+ENSMUST00000168613_musmusculus_1 ENSMUST00000040820_musmusculus_1 76
+ENSSSCT00000032764_susscrofa_1 ENSSSCT00000013404_susscrofa_1 53
 ENSGALT00000026158_gallusgallus_1 ENSMGAT00000016431_meleagrisgallopavo_1 100
 ENSDART00000160057_daniorerio_1 ENSDART00000132084_daniorerio_1 100
 ENSMPUT00000012759_mustelaputoriusfuro_1 ENSCAFT00000022939_canisfamiliaris_1 100
@@ -92,11 +89,10 @@
 ENSSTOT00000004965_ictidomystridecemlineatus_1 ENSECAT00000024641_equuscaballus_1 100
 ENSMLUT00000001440_myotislucifugus_1 ENSPANT00000027606_papioanubis_1 100
 ENSORLT00000017214_oryziaslatipes_1 ENSPFOT00000009022_poeciliaformosa_1 100
-ENSMUST00000163344_musmusculus_1 ENSMUST00000168613_musmusculus_1 63
+ENSMUST00000163344_musmusculus_1 ENSMUST00000168613_musmusculus_1 31
 ENSACAT00000017993_anoliscarolinensis_1 ENSPSIT00000017443_pelodiscussinensis_1 100
 ENSCJAT00000021080_callithrixjacchus_1 ENSCJAT00000058575_callithrixjacchus_1 100
-ENSOPRT00000000678_ochotonaprinceps_1 ENSOGAT00000030491_otolemurgarnettii_1 100
-ENSOPRT00000000678_ochotonaprinceps_1 ENSOGAT00000030491_otolemurgarnettii_1 19
+ENSOPRT00000000678_ochotonaprinceps_1 ENSOGAT00000030491_otolemurgarnettii_1 71
 ENSMMUT00000027384_macacamulatta_1 ENSPANT00000027701_papioanubis_1 100
 ENSMMUT00000027387_macacamulatta_1 ENSPANT00000027631_papioanubis_1 100
 ENSLOCT00000019886_lepisosteusoculatus_1 ENSDART00000160057_daniorerio_1 100
@@ -118,7 +114,7 @@
 ENSTTRT00000009129_tursiopstruncatus_1 ENSCAFT00000022963_canisfamiliaris_1 100
 ENSCAFT00000022963_canisfamiliaris_1 ENSAMET00000018029_ailuropodamelanoleuca_1 100
 ENSGGOT00000008973_gorillagorilla_1 ENSPPYT00000023637_pongoabelii_1 100
-ENSOGAT00000031973_otolemurgarnettii_1 ENSOGAT00000005620_otolemurgarnettii_1 100
+ENSOGAT00000031973_otolemurgarnettii_1 ENSOGAT00000005620_otolemurgarnettii_1 77
 ENSGACT00000024065_gasterosteusaculeatus_1 ENSGACT00000024064_gasterosteusaculeatus_1 100
 ENSGACT00000024064_gasterosteusaculeatus_1 ENSGACT00000024065_gasterosteusaculeatus_1 100
 ENSAMET00000018029_ailuropodamelanoleuca_1 ENSCAFT00000022963_canisfamiliaris_1 100
@@ -142,19 +138,19 @@
 ENSLAFT00000027936_loxodontaafricana_1 ENSLAFT00000015029_loxodontaafricana_1 100
 ENSPSIT00000016442_pelodiscussinensis_1 ENSAPLT00000013117_anasplatyrhynchos_1 100
 ENSOART00000003319_ovisaries_1 ENSBTAT00000021570_bostaurus_1 100
-ENSMMUT00000046681_macacamulatta_1 ENSPANT00000027701_papioanubis_1 28
-ENSMMUT00000046680_macacamulatta_1 ENSCSAT00000012035_chlorocebussabaeus_1 44
+ENSMMUT00000046681_macacamulatta_1 ENSPANT00000027701_papioanubis_1 14
+ENSMMUT00000046680_macacamulatta_1 ENSCSAT00000012035_chlorocebussabaeus_1 22
 ENSBTAT00000021570_bostaurus_1 ENSOART00000003319_ovisaries_1 100
 ENST00000378069_homosapiens_1 ENSGGOT00000000206_gorillagorilla_1 100
 ENSPANT00000027606_papioanubis_1 ENSPANT00000027631_papioanubis_1 100
 ENSLAFT00000000504_loxodontaafricana_1 ENSECAT00000024641_equuscaballus_1 100
 ENSPCAT00000006605_procaviacapensis_1 ENSLAFT00000000504_loxodontaafricana_1 100
 ENSLOCT00000002323_lepisosteusoculatus_1 ENSDART00000028225_daniorerio_1 100
-ENSMUST00000173143_musmusculus_1 ENSMUST00000163344_musmusculus_1 5
+ENSMUST00000173143_musmusculus_1 ENSMUST00000163344_musmusculus_1 3
 ENSRNOT00000044009_rattusnorvegicus_1 ENSMUST00000040820_musmusculus_1 100
 ENSMODT00000026841_monodelphisdomestica_1 ENSCAFT00000022939_canisfamiliaris_1 100
 ENSLACT00000015911_latimeriachalumnae_1 ENSLACT00000026572_latimeriachalumnae_1 100
-ENSSSCT00000035258_susscrofa_1 ENSSSCT00000013404_susscrofa_1 97
+ENSSSCT00000035258_susscrofa_1 ENSSSCT00000013404_susscrofa_1 48
 ENSTRUT00000011582_takifugurubripes_1 ENSTRUT00000011581_takifugurubripes_1 100
 ENSPSIT00000017443_pelodiscussinensis_1 ENSPSIT00000017454_pelodiscussinensis_1 100
 ENSTRUT00000011580_takifugurubripes_1 ENSTRUT00000011581_takifugurubripes_1 100
b
diff -r 376ed15e0d27 -r 70df762b48a8 test-data/output2.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output2.tabular Tue Oct 03 04:51:45 2017 -0400
b
@@ -0,0 +1,50 @@
+ENSFCAT00000013089_feliscatus_1 ENSCAFT00000022939_canisfamiliaris_1 100
+ENSCAFT00000022939_canisfamiliaris_1 ENSFCAT00000013089_feliscatus_1 100
+ENSAMXT00000002585_astyanaxmexicanus_1 ENSDART00000028225_daniorerio_1 100
+ENSPSIT00000017454_pelodiscussinensis_1 ENSPSIT00000017443_pelodiscussinensis_1 100
+ENSRNOT00000066674_rattusnorvegicus_1 ENSMUST00000026013_musmusculus_1 100
+ENSLACT00000026572_latimeriachalumnae_1 ENSLACT00000015911_latimeriachalumnae_1 100
+ENSTGUT00000006603_taeniopygiaguttata_1 ENSFALT00000001591_ficedulaalbicollis_1 100
+ENSFALT00000001560_ficedulaalbicollis_1 ENSTGUT00000006498_taeniopygiaguttata_1 100
+ENSXMAT00000001796_xiphophorusmaculatus_1 ENSONIT00000016435_oreochromisniloticus_1 100
+ENSGALT00000036672_gallusgallus_1 ENSMGAT00000016429_meleagrisgallopavo_1 100
+ENSAPLT00000013855_anasplatyrhynchos_1 ENSMGAT00000016431_meleagrisgallopavo_1 100
+ENSDART00000132084_daniorerio_1 ENSDART00000160057_daniorerio_1 100
+ENSPTRT00000040521_pantroglodytes_1 ENSGGOT00000000206_gorillagorilla_1 100
+ENSXETT00000010517_xenopustropicalis_1 ENSXETT00000010521_xenopustropicalis_1 100
+ENSTRUT00000035430_takifugurubripes_1 ENSTNIT00000014720_tetraodonnigroviridis_1 100
+ENSFALT00000001591_ficedulaalbicollis_1 ENSTGUT00000006603_taeniopygiaguttata_1 100
+ENSDART00000160057_daniorerio_1 ENSDART00000132084_daniorerio_1 100
+ENSLACT00000014695_latimeriachalumnae_1 ENSLACT00000026689_latimeriachalumnae_1 100
+ENSDART00000028225_daniorerio_1 ENSAMXT00000002585_astyanaxmexicanus_1 100
+ENSPANT00000027631_papioanubis_1 ENSMMUT00000027387_macacamulatta_1 100
+ENSMUST00000026013_musmusculus_1 ENSRNOT00000066674_rattusnorvegicus_1 100
+ENSONIT00000016435_oreochromisniloticus_1 ENSXMAT00000001796_xiphophorusmaculatus_1 100
+ENSTNIT00000014720_tetraodonnigroviridis_1 ENSTRUT00000035430_takifugurubripes_1 100
+ENSCJAT00000058575_callithrixjacchus_1 ENSCJAT00000021080_callithrixjacchus_1 100
+ENSCJAT00000021080_callithrixjacchus_1 ENSCJAT00000058575_callithrixjacchus_1 100
+ENSMMUT00000027387_macacamulatta_1 ENSPANT00000027631_papioanubis_1 100
+ENSMGAT00000016429_meleagrisgallopavo_1 ENSGALT00000036672_gallusgallus_1 100
+ENSSSCT00000023183_susscrofa_1 ENSSSCT00000033745_susscrofa_1 100
+ENSGGOT00000000206_gorillagorilla_1 ENSPTRT00000040521_pantroglodytes_1 100
+ENSXETT00000010521_xenopustropicalis_1 ENSXETT00000010517_xenopustropicalis_1 100
+ENSCAFT00000022963_canisfamiliaris_1 ENSAMET00000018029_ailuropodamelanoleuca_1 100
+ENSGACT00000024065_gasterosteusaculeatus_1 ENSGACT00000024064_gasterosteusaculeatus_1 100
+ENSGACT00000024064_gasterosteusaculeatus_1 ENSGACT00000024065_gasterosteusaculeatus_1 100
+ENSAMET00000018029_ailuropodamelanoleuca_1 ENSCAFT00000022963_canisfamiliaris_1 100
+ENSSSCT00000033745_susscrofa_1 ENSSSCT00000023183_susscrofa_1 100
+ENSMGAT00000016431_meleagrisgallopavo_1 ENSAPLT00000013855_anasplatyrhynchos_1 100
+ENSPPYT00000023640_pongoabelii_1 ENSPPYT00000023641_pongoabelii_1 100
+ENSPPYT00000023641_pongoabelii_1 ENSPPYT00000023640_pongoabelii_1 100
+ENSTGUT00000006498_taeniopygiaguttata_1 ENSFALT00000001560_ficedulaalbicollis_1 100
+ENSLACT00000026689_latimeriachalumnae_1 ENSLACT00000014695_latimeriachalumnae_1 100
+ENSPANT00000027701_papioanubis_1 ENSPPYT00000023637_pongoabelii_1 100
+ENSPPYT00000023637_pongoabelii_1 ENSPANT00000027701_papioanubis_1 100
+ENSMUST00000040820_musmusculus_1 ENSRNOT00000044009_rattusnorvegicus_1 100
+ENSOART00000003319_ovisaries_1 ENSBTAT00000021570_bostaurus_1 100
+ENSBTAT00000021570_bostaurus_1 ENSOART00000003319_ovisaries_1 100
+ENSRNOT00000044009_rattusnorvegicus_1 ENSMUST00000040820_musmusculus_1 100
+ENSLACT00000015911_latimeriachalumnae_1 ENSLACT00000026572_latimeriachalumnae_1 100
+ENSTRUT00000011582_takifugurubripes_1 ENSTRUT00000011581_takifugurubripes_1 100
+ENSPSIT00000017443_pelodiscussinensis_1 ENSPSIT00000017454_pelodiscussinensis_1 100
+ENSTRUT00000011581_takifugurubripes_1 ENSTRUT00000011582_takifugurubripes_1 100