0
|
1 <tool name="blasttoolssearch" id="blasttoolssearch" version="3.0">
|
|
2 <!--Source in git at: https://github.com/fubar2/galaxy-->
|
|
3 <!--Created by toolfactory@galaxy.org at 19/07/2023 12:39:19 using the Galaxy Tool Factory.-->
|
|
4 <description>Runs a legacy Java jar called blasttools from https://github.com/schmidda/blast-tools/tree/master</description>
|
|
<!-- Conda packages: csvtk reorders the BLAST columns, openjdk provides the java runtime for BlastTools.jar. -->
5 <requirements>
|
|
6 <requirement type="package">csvtk</requirement>
|
|
7 <requirement type="package">openjdk</requirement>
|
|
8 </requirements>
|
|
<!-- Any exit code of 1 or above marks the job as failed. -->
9 <stdio>
|
|
10 <exit_code level="fatal" range="1:"/>
|
|
11 </stdio>
|
|
12 <version_command>echo "3.0"</version_command>
|
|
<!-- Runs the generated runme script with three positional arguments: the input BLAST table, the path to the bundled BlastTools.jar, and the output report. Each value is single-quoted so that paths containing spaces or shell metacharacters reach the script as one argument. -->
13 <command><![CDATA[bash
|
|
14 '$runme'
|
2
|
15 '$blastn_search_outputs'
|
0
|
16 '$__tool_directory__/BlastTools.jar'
|
|
17 '$summary_viruses_viroids'
|
|
18 ]]></command>
|
|
19 <configfiles>
|
|
20 <configfile name="runme"><![CDATA[#raw
|
|
21
|
|
22
|
|
23 ## eResearch Office, QUT
|
|
24 ## Created: 31 March 2021
|
|
25 ## Last modified: 28 September 2022
|
|
26 ## Script: Processes Galaxy Australia generated blastN outputs to summarise and report hits to REGULATED and ENDEMIC viruses/viroids.
|
|
27 ## Usage: bash runme <blastn_output.tabular> <path/to/BlastTools.jar> <report_output>
|
|
28 ## changed to accept a single input file name passed as $1
|
|
29 ## Ross Lazarus for a ToolFactory wrapper for Roberto Barrero
|
|
30 ## July 18 2023
|
|
31
|
|
32 dataPath=${PWD}
|
|
33
|
|
34 # Requirement: $1 is a single GA-VSD blastN tabular output with the 25 columns listed under STEP1.
|
|
35 # $2 is the path to BlastTools.jar and $3 is the path of the report file to write.
|
|
36
|
|
37 #Processing tabular files
|
|
38 file="$1"
|
|
39
|
|
40 var=$(basename "$file")
|
|
41
|
|
42 #STEP0: fetch Top 1 Hits (the first line in the input for each distinct query id in column 1)
|
|
43 awk '{print $1}' "$file" | sort -u > "${var}.top1.ids"
|
|
44 for i in `cat "${var}.top1.ids"`
|
|
45 do
|
|
46 echo "fetching top hits..." $i;
|
|
47 awk -v id="$i" '$1 == id {print; exit}' "$file" >> "${var}.top1Hits.txt";   # exact match on column 1, so id contig_1 never picks up a contig_10 line
|
|
48 done
|
|
49
|
|
50 #STEP1: modify the columns of Galaxy Australia (GA) blast output to the expected format by the BlastTools.jar tool
|
|
51 ###### namely: qseqid sgi sacc length pident mismatch gapopen qstart qend qlen sstart send slen sstrand evalue bitscore qcovhsp stitle staxids qseq sseq sseqid qcovs qframe sframe
|
|
52 cat "${var}.top1Hits.txt" |csvtk cut -H -t -f 1,19,20,4,3,5,6,7,8,17,9,10,18,22,11,12,24,21,25,15,16,2,23,13,14 | sed 's/ /_/g' > "${var}.txt"
|
|
53
|
|
54 #STEP2: summarise the GA blastN files
|
|
55 java -jar "$2" -t blastn "${var}.txt" || exit $?   # propagate a BlastTools failure; otherwise the final cat would make the job exit 0
|
|
56 #filter virus/viroid/endo
|
|
57 cat "summary_${var}.txt" | grep "virus\|viroid\|endo" > "summary_${var}_filtered.txt"
|
|
58
|
|
59 #STEP3: fetch unique names from Blast summary reports
|
|
60 cat "summary_${var}_filtered.txt" | awk '{print $7}' | awk -F "|" '{print $2}'| sort | uniq | sed 's/Species://' > "${var}_uniq.ids"
|
|
61
|
|
62 #STEP4: retrieve the best hit for each virus/viroid
|
|
63 echo "processing top hits ..."
|
|
64 touch "${var}_filtered.txt"
|
|
65 for id in `cat "${var}_uniq.ids"`
|
|
66 do
|
|
67 #print on the screen the name of the virus/viroids to search
|
|
68 #echo "fetching species matches ..." $id
|
|
69
|
|
70 #fetch the virus name on the summary_blastn file by selecting the longest alignment (column 3) and highest genome coverage (column 5); -F matches the name literally, not as a regex
|
|
71 grep -F -- "$id" "summary_${var}.txt" | sort -k3,3nr -k5,5nr | head -1 >> "${var}_filtered.txt"
|
|
72 done
|
|
73
|
|
74 #print the header of the initial summary_blastn file
|
|
75 cat "summary_${var}.txt" | head -1 > header
|
|
76 #report 1: write "#" followed by the header line, then the best hit per species
|
2
|
77 echo -n "#" > "$3"
|
|
78 cat header "${var}_filtered.txt" >> "$3"
|
0
|
79
|
|
80 #end raw]]></configfile>
|
|
81 </configfiles>
|
|
<!-- One required tabular dataset: the blastN search output to summarise. -->
82 <inputs>
|
|
83 <param name="blastn_search_outputs" label="blastn_search_outputs" type="data" format="tabular" multiple="false" optional="false" help=""/>
|
|
84 </inputs>
|
|
<!-- One visible tabular dataset: the report written by the runme script. -->
85 <outputs>
|
1
|
86 <data name="summary_viruses_viroids" label="summary_viruses_viroids" format="tabular" hidden="false"/>
|
0
|
87 </outputs>
|
|
<!-- Single regression test: run the sample blastN table and require the report to match the stored sample with zero differing lines. -->
88 <tests>
|
|
89 <test>
|
|
90 <param name="blastn_search_outputs" value="blastn_search_outputs_sample"/>
|
|
91 <output name="summary_viruses_viroids" value="summary_viruses_viroids_sample" lines_diff="0" compare="diff"/>
|
|
92 </test>
|
|
93 </tests>
|
|
94 <help><![CDATA[
|
|
95
|
|
96 **What it Does**
|
|
97
|
|
98 Wraps https://github.com/schmidda/blast-tools/tree/master as a Galaxy tool as a demonstration for Roberto Barrero
|
|
99
|
2
|
100
|
0
|
101
|
|
102 ------
|
|
103
|
|
104
|
|
105 Script::
|
|
106
|
|
107 ## eResearch Office, QUT
|
|
108 ## Created: 31 March 2021
|
|
109 ## Last modified: 28 September 2022
|
|
110 ## Script: Processes Galaxy Australia generated blastN outputs to summarise and report hits to REGULATED and ENDEMIC viruses/viroids.
|
|
111 ## Usage: ./run_VirReport_Summary.sh
|
|
112 ## changed to accept a single input file name passed as $1
|
|
113 ## Ross Lazarus for a ToolFactory wrapper for Roberto Barrero
|
|
114 ## July 18 2023
|
|
115 dataPath=${PWD}
|
|
116 # Requirement: One or more GA-VSD .tabular outputs need to be in the folder where the command above (Usage) is executed.
|
|
117 # The script will look for all files with the suffix *.tabular
|
|
118 #Processing tabular files
|
|
119 file=$1
|
|
120 var=$(basename $file)
|
|
121 #STEP0: fetch Top 1 Hits
|
|
122 cat $file | awk '{print $1}' | sort | uniq > ${var}.top1.ids
|
|
123 for i in `cat ${var}.top1.ids`
|
|
124 do
|
|
125 echo "fetching top hits..." $i;
|
|
126 grep $i $file | head -1 >> ${var}.top1Hits.txt;
|
|
127 done
|
|
128 #STEP1: modify the columns of Galaxy Australia (GA) blast output to the expected format by the BlastTools.jar tool
|
|
129 ###### namely: qseqid sgi sacc length pident mismatch gapopen qstart qend qlen sstart send slen sstrand evalue bitscore qcovhsp stitle staxids qseq sseq sseqid qcovs qframe sframe
|
|
130 cat ${var}.top1Hits.txt |csvtk cut -H -t -f 1,19,20,4,3,5,6,7,8,17,9,10,18,22,11,12,24,21,25,15,16,2,23,13,14 | sed 's/ /_/g' > ${var}.txt
|
|
131 #STEP2: summarise the GA blastN files
|
|
132 #java -jar /mnt/c/Users/lelwala/HTS/BlastTools.jar -t blastn ${var}.txt
|
|
133 java -jar $2 -t blastn ${var}.txt
|
|
134 #filter virus/viroid/endo
|
|
135 cat summary_${var}.txt | grep "virus\|viroid\|endo" > summary_${var}_filtered.txt
|
|
136 #STEP3: fetch unique names from Blast summary reports
|
|
137 cat summary_${var}_filtered.txt | awk '{print $7}' | awk -F "|" '{print $2}'| sort | uniq | sed 's/Species://' > ${var}_uniq.ids
|
|
138 #STEP4: retrieve the best hit for each virus/viroid
|
|
139 echo "processing top hits ..."
|
|
140 touch ${var}_filtered.txt
|
|
141 for id in `cat ${var}_uniq.ids`
|
|
142 do
|
|
143 #print on the screen the name of the virus/viroids to search
|
|
144 #echo "fetching species matches ..." $id
|
|
145 #fetch the virus name on the summary_blastn file by selecting the longest alignment (column 3) and highest genome coverage (column 5)
|
|
146 grep $id summary_${var}.txt | sort -k3,3nr -k5,5nr | head -1 >> ${var}_filtered.txt
|
|
147 done
|
|
148 #print the header of the initial summary_blastn file
|
|
149 cat summary_${var}.txt | head -1 > header
|
|
150 #report 1
|
|
151 cat header ${var}_filtered.txt > $3
|
|
152 #removing intermediate files
|
|
153 rm summary_${file}.txt ${file}.txt ${file}.top1.ids ${file}_uniq.ids summary_${file}_filtered.txt header* ${var}_filtered.txt *top1Hits.txt
|
|
154
|
|
155 ]]></help>
|
|
<!-- NOTE(review): presumably the DOI of the Galaxy Tool Factory publication, since the header comment says this wrapper was generated by the Tool Factory; confirm before release. -->
156 <citations>
|
|
157 <citation type="doi">10.1093/bioinformatics/bts573</citation>
|
|
158 </citations>
|
|
159 </tool>
|
|
160
|