#!/bin/bash

## eResearch Office, QUT
## Created: 31 March 2021
## Last modified: 24 May 2021
## Script: Processes Galaxy Australia generated blastN outputs to summarise and
##         report hits to REGULATED and ENDEMIC viruses/viroids.
## Usage: ./run_VSD_report.sh
##
## Requirement: one or more GA-VSD .tabular outputs need to be in the folder
## where the command above (Usage) is executed. The script looks for all files
## with the suffix *.tabular.
##
## Required file in the same folder as the tabular outputs:
##   ICTV_taxonomy_MinIdentity_Species_20210514.tsv  (override with ICTV_FILE)
## Required tools: csvtk, java, BlastTools.jar        (override with BLASTTOOLS_JAR)
##
## For every <input> matching *.tabular the following reports are written to
## the current folder:
##   summary_<input>_REGULATED_viruses_viroids_ICTV.txt
##   summary_<input>_ENDEMIC_viruses_viroids_ICTV.txt
## The summary_<input>.txt file read by this script is expected to be produced
## by BlastTools.jar and is left in place.
##
## Help: ./run_VSD_report.sh -h   or   ./run_VSD_report.sh --help

# `set -e` is intentionally not used: steps that may legitimately return
# non-zero (grep without matches, join on imperfectly sorted input) must not
# abort the run. Critical steps are checked explicitly instead.
set -u
set -o pipefail

ICTV=${ICTV_FILE:-ICTV_taxonomy_MinIdentity_Species_20210514.tsv}
BLASTTOOLS_JAR=${BLASTTOOLS_JAR:-/mnt/c/Users/lelwala/HTS/BlastTools.jar}

# Root of the scratch area that holds all intermediate files; set in main().
TMP_ROOT=''

# awk helper shared by several steps: returns the species name stored in
# column 7 of a summary line as the third "|"-separated item, with the
# "Species:" tag removed.
readonly AWK_SPECIES='
function species(    parts, name) {
  split($7, parts, "|")
  name = parts[3]
  sub(/Species:/, "", name)
  return name
}
'

# Prints an error message to stderr and terminates the script with status 1.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Prints the usage line.
usage() {
  echo "Usage: ./${0##*/}"
}

# Removes the scratch directory; installed as the EXIT trap.
cleanup() {
  if [[ -n "${TMP_ROOT}" ]]; then
    rm -rf -- "${TMP_ROOT}"
  fi
}

#######################################
# Builds one "<status>_viruses_viroids_ICTV" table.
# Arguments:
#   $1 - pattern selecting the hits ("regulated" or "endemic")
#   $2 - file with the best hit per species
#   $3 - scratch directory
#   $4 - prefix for the intermediate files inside the scratch directory
# Outputs:
#   Writes "$3/$4.ICTV_joined" (no header).
#######################################
build_ictv_table() {
  local pattern=$1
  local best_hits=$2
  local work=$3
  local tag=$4

  # Hits carrying the requested status anywhere on the line.
  awk -v pat="${pattern}" '$0 ~ pat' "${best_hits}" > "${work}/${tag}.hits"

  # Species names of those hits, one per line, in the same order.
  awk "${AWK_SPECIES}"'{ print species() }' "${work}/${tag}.hits" \
    > "${work}/${tag}.names"

  # Prepend the species name as the first column of every hit.
  paste "${work}/${tag}.names" "${work}/${tag}.hits" > "${work}/${tag}.MOD"

  # STEP5: fetch ICTV information for the wanted species. No match is fine and
  # simply leaves the ICTV columns empty after the join.
  grep -w -F -f "${work}/${tag}.names" -- "${ICTV}" > "${work}/${tag}.ICTV"

  # Join the hits with the ICTV information on the species name (field 1 of
  # both inputs), keeping hits that have no ICTV entry (-a 1). Only rows whose
  # 4th field (summary column 3, the alignment length per the selection step
  # in process_file) is >= 70 are kept.
  join -a 1 -1 1 -2 1 "${work}/${tag}.MOD" "${work}/${tag}.ICTV" \
    | tr ' ' '\t' \
    | awk '$4>=70' > "${work}/${tag}.ICTV_joined" \
    || printf 'WARNING: join reported a problem for %s hits (inputs may not be sorted on the species column); the report may be incomplete\n' \
      "${pattern}" >&2
}

#######################################
# Processes a single Galaxy Australia blastN .tabular file.
# Arguments:
#   $1 - name of the .tabular file in the current folder
# Outputs:
#   summary_<input>_REGULATED_viruses_viroids_ICTV.txt and
#   summary_<input>_ENDEMIC_viruses_viroids_ICTV.txt in the current folder.
#######################################
process_file() {
  local file=$1
  local var=${file##*/}
  local summary="summary_${var}.txt"
  local work="${TMP_ROOT}/${var}"
  local id

  mkdir -p -- "${work}" || die "cannot create scratch directory ${work}"

  # STEP0: fetch Top 1 Hits, i.e. the first line reported for every query id
  # (column 1). The match is exact on the column, so "contig_1" does not pick
  # up a line of "contig_10". Output is ordered by query id.
  echo "fetching top hits..." "${var}"
  awk -F '\t' '!seen[$1]++' "${file}" \
    | sort -t $'\t' -k1,1 > "${work}/top1Hits.txt" \
    || die "could not extract top hits from ${file}"

  # STEP1: modify the columns of the Galaxy Australia (GA) blast output to the
  # format expected by the BlastTools.jar tool, namely:
  #   qseqid sgi sacc length pident mismatch gapopen qstart qend qlen sstart
  #   send slen sstrand evalue bitscore qcovhsp stitle staxids qseq sseq sseqid
  #   qcovs qframe sframe
  # Spaces are replaced by underscores so that later whitespace-based field
  # splitting keeps every column intact.
  csvtk cut -H -t \
    -f 1,19,20,4,3,5,6,7,8,17,9,10,18,22,11,12,24,21,25,15,16,2,23,13,14 \
    < "${work}/top1Hits.txt" \
    | sed 's/ /_/g' > "${var}.txt" \
    || die "csvtk could not reformat ${file}"

  # STEP2: summarise the GA blastN file. BlastTools.jar is expected to write
  # summary_<input>.txt into the current folder.
  java -jar "${BLASTTOOLS_JAR}" -t blastn "${var}.txt" \
    || die "BlastTools.jar failed on ${var}.txt"
  [[ -f "${summary}" ]] || die "expected BlastTools output ${summary} not found"

  # STEP3: fetch the unique species names of all regulated/endemic/LandPlant
  # hits from the Blast summary report.
  awk "${AWK_SPECIES}"'
    /regulated|endemic|LandPlant/ {
      wanted = species()
      if (wanted != "") print wanted
    }
  ' "${summary}" | sort -u > "${work}/uniq.ids" \
    || die "could not collect species names from ${summary}"

  # STEP4: retrieve the best hit for each virus/viroid, selecting the longest
  # alignment (column 3) and then the highest genome coverage (column 5).
  # Species are matched exactly rather than as a substring/regex of the line.
  echo "processing top hits ..."
  : > "${work}/best_hits.txt"
  while IFS= read -r id; do
    [[ -n "${id}" ]] || continue
    # awk 'NR == 1' (instead of head) consumes all of sort's output, so sort
    # never receives SIGPIPE and pipefail does not report a spurious failure.
    awk -v id="${id}" "${AWK_SPECIES}"'species() == id' "${summary}" \
      | sort -k3,3nr -k5,5nr \
      | awk 'NR == 1' >> "${work}/best_hits.txt"
    # print name of virus/viroid being processed
    echo "${id}"
  done < "${work}/uniq.ids"

  # Fetch hits to REGULATED and ENDEMIC viruses and attach ICTV information.
  # This runs once per input file, after all best hits have been collected.
  build_ictv_table "regulated" "${work}/best_hits.txt" "${work}" "regulated"
  build_ictv_table "endemic" "${work}/best_hits.txt" "${work}" "endemic"

  # Header of the final reports: first line of the summary file, framed by a
  # leading "Species" column and a trailing "ICTV_information" column.
  awk 'NR == 1 { print "Species" "\t" $0 "\t" "ICTV_information" }' \
    "${summary}" > "${work}/header2"

  ##### REPORT ##### add the header to the identified hits.
  cat "${work}/header2" "${work}/regulated.ICTV_joined" \
    > "summary_${var}_REGULATED_viruses_viroids_ICTV.txt"

  # As in the original workflow, the ENDEMIC report additionally drops rows
  # whose first three tab-separated fields are not all non-empty.
  cat "${work}/header2" "${work}/endemic.ICTV_joined" \
    | awk -F "\t" '$1!=""&&$2!=""&&$3!=""' \
    > "summary_${var}_ENDEMIC_viruses_viroids_ICTV.txt"

  echo "completed!"

  # Removing intermediate files: everything else lives in the scratch
  # directory, which the EXIT trap deletes.
  rm -f -- "${var}.txt"
  rm -rf -- "${work}"
}

# Entry point: handles the help options, validates the environment and
# processes every *.tabular file found in the current folder.
main() {
  local inputs
  local file

  # Help information to user (i.e., script_name -h or script_name --help)
  case "${1:-}" in
    -h | --help)
      usage
      exit 0
      ;;
  esac

  # Collect the inputs with nullglob so that "no match" yields an empty list
  # instead of the literal pattern.
  shopt -s nullglob
  inputs=(*.tabular)
  shopt -u nullglob
  ((${#inputs[@]} > 0)) || die "no *.tabular files found in ${PWD}"

  command -v csvtk > /dev/null || die "csvtk not found in PATH"
  command -v java > /dev/null || die "java not found in PATH"
  [[ -f "${BLASTTOOLS_JAR}" ]] || die "BlastTools jar not found: ${BLASTTOOLS_JAR}"
  [[ -f "${ICTV}" ]] || die "required ICTV file not found: ${ICTV}"

  TMP_ROOT=$(mktemp -d) || die "mktemp failed"
  trap cleanup EXIT

  # Processing tabular files
  for file in "${inputs[@]}"; do
    process_file "${file}"
  done
}

main "$@"