changeset 0:03e20b2ddead draft default tip

Uploaded
author lelwala
date Wed, 11 Aug 2021 00:23:59 +0000
parents
children
files mnt/c/Users/lelwala/HTS/myTools/HTS_script/GA_report.tar.gz mnt/c/Users/lelwala/HTS/myTools/HTS_script/Run_VSD.xml mnt/c/Users/lelwala/HTS/myTools/HTS_script/run_VSD_report_20210604.sh
diffstat 2 files changed, 129 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mnt/c/Users/lelwala/HTS/myTools/HTS_script/Run_VSD.xml	Wed Aug 11 00:23:59 2021 +0000
@@ -0,0 +1,13 @@
+<tool id="VSD_report" name="Summarise GA outputs" version="0.1.0">
+    <description>
+    </requirements>
+    <command>./run_VSD_report_20210604.sh</command>
+    <inputs>
+		<param type="data" format="tabular"/>
+    </inputs>
+    <outputs>
+    </outputs>
+    <help>
+	This tool generates summary reports fron GA blastn outputs
+	</help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mnt/c/Users/lelwala/HTS/myTools/HTS_script/run_VSD_report_20210604.sh	Wed Aug 11 00:23:59 2021 +0000
@@ -0,0 +1,116 @@
+#!/bin/bash
+
+## eResearch Office, QUT
+## Created:  31 March 2021
+## Last modified: 24 May 2021
+## Script: Processes Galaxy Australia generated blastN outputs to summarise and report hits to REGULATED and ENDEMIC viruses/viroids.
+## Usage: ./run_VSD_report.sh
+
+dataPath=${PWD}
+
+# Requirement: One or more GA-VSD .tabular outputs need to be in the folder where the command above (Usage)is executed.
+# The script will Look for all files with the suffix *.tabular
+
+# Help information to user (i.e., script_name -h or script_name --help)
+
+#Required file in the same folder of tabular outputs
+ICTV='ICTV_taxonomy_MinIdentity_Species_20210514.tsv'
+
+
+if [ "$1" == "-h" ]; then
+  echo "Usage: "./`basename ./$0`" "
+  exit 0
+
+elif [ "$1" == "--help" ]
+  then
+    echo "Usage: "./`basename $0`" "
+    exit 1
+fi
+
+#Processing tabular files
+
+for file in *.tabular
+   do
+      var=$(basename $file)
+      
+      #STEP0: fetch Top 1 Hits
+      cat $file | awk '{print $1}' | sort | uniq > ${var}.top1.ids
+      for i in `cat ${var}.top1.ids`; do echo "fetching top hits..." $i; grep $i $file | head -1 >> ${var}.top1Hits.txt ; done
+
+      #STEP1: modify the columns of Galaxy Australia (GA) blast output to the expected format by the BlastTools.jar tool
+      ######  namely: qseqid sgi sacc length pident mismatch gapopen qstart qend qlen sstart send slen sstrand evalue bitscore qcovhsp stitle staxids qseq sseq sseqid qcovs qframe sframe
+      cat ${var}.top1Hits.txt |csvtk cut -H -t -f 1,19,20,4,3,5,6,7,8,17,9,10,18,22,11,12,24,21,25,15,16,2,23,13,14 | sed 's/ /_/g' > ${var}.txt 
+      
+      #STEP2: summarise the GA blastN files
+      java -jar /mnt/c/Users/lelwala/HTS/BlastTools.jar -t blastn ${var}.txt 
+
+      #filter regulated/edemic/LandPlant
+      cat summary_${var}.txt | grep "regulated" >> summary_${var}_filtered.txt
+      cat summary_${var}.txt | grep "endemic" >> summary_${var}_filtered.txt
+      cat summary_${var}.txt | grep "LandPlant" >> summary_${var}_filtered.txt
+
+      #STEP3: fetch unique names from Blast summary reports
+      cat summary_${var}_filtered.txt | awk '{print $7}' | awk -F "|" '{print $3}'| sort | uniq | sed 's/Species://' > ${var}_uniq.ids
+
+      #STEP4: retrieve the best hit for each virus/viroid
+      echo "processing top hits ..."
+      for id in `cat ${var}_uniq.ids` 
+        do
+          #print on the screen the name of the virus/viroids to search
+          #echo "fetching species matches ..." $id
+          
+          #fetch the virus name on the summary_blastn file by selecteing longest alignment (column 3) and highest genome coverage (column 5)
+          grep $id summary_${var}.txt | sort -k3,3nr -k5,5nr | head -1 >> ${var}_filtered.txt
+
+          #print the header of the inital summary_blastn file
+          cat summary_${var}.txt | head -1 > header
+
+          #fetch hits to REGULATED and ENDEMIC viruses
+          grep "regulated" ${var}_filtered.txt > summary_${var}_REGULATED_viruses_viroids
+          
+          grep "endemic" ${var}_filtered.txt > summary_${var}_ENDEMIC_viruses_viroids
+
+          ##### REPORT1 ##### add header to columns
+          cat header summary_${var}_REGULATED_viruses_viroids > summary_${var}_REGULATED_viruses_viroids.txt
+
+          cat header summary_${var}_ENDEMIC_viruses_viroids > summary_${var}_ENDEMIC_viruses_viroids.txt
+
+          #fetch genus names of identified hits
+          awk '{print $7}' summary_${var}_REGULATED_viruses_viroids.txt | awk -F "|" '{print $3}' | sed 's/Species://' | sed 1d > wanted_regulated.names
+
+          awk '{print $7}' summary_${var}_ENDEMIC_viruses_viroids.txt | awk -F "|" '{print $3}' | sed 's/Species://' | sed 1d > wanted_endemic.names
+
+          #add species to report
+          paste wanted_regulated.names summary_${var}_REGULATED_viruses_viroids > summary_${var}_REGULATED_viruses_viroids.MOD
+          
+          paste wanted_endemic.names summary_${var}_ENDEMIC_viruses_viroids > summary_${var}_ENDEMIC_viruses_viroids.MOD
+
+          #STEP5: fecth ICTV information
+          grep -w -F -f wanted_regulated.names $ICTV > wanted_regulated.ICTV
+          
+          grep -w -F -f wanted_endemic.names $ICTV > wanted_endemic.ICTV
+
+          #join reports with ICTV information
+          join -a 1 -1 1 -2 1 summary_${var}_REGULATED_viruses_viroids.MOD  wanted_regulated.ICTV | tr ' ' '\t' | awk '$4>=70' >  summary_${var}_REGULATED_viruses_viroids_ICTV
+  
+          #print name of virus/viroid being processed
+          echo "$id"
+ 
+          join -a 1 -1 1 -2 1 summary_${var}_ENDEMIC_viruses_viroids.MOD  wanted_endemic.ICTV | tr ' ' '\t' | awk '$4>=70' > summary_${var}_ENDEMIC_viruses_viroids_ICTV   
+
+          #modify header
+          awk '{print "Species" "\t" $0 "\t" "ICTV_information"}' header > header2
+
+          ##### REPORT2 ##### add header2 to identified hits
+          cat header2 summary_${var}_REGULATED_viruses_viroids_ICTV > summary_${var}_REGULATED_viruses_viroids_ICTV.txt
+
+          cat header2 summary_${var}_ENDEMIC_viruses_viroids_ICTV | awk -F"\t" '$1!=""&&$2!=""&&$3!=""' > summary_${var}_ENDEMIC_viruses_viroids_ICTV.txt
+         
+      done
+
+echo "completed!"
+
+#removing intermediate files
+rm ${var}.txt ${var}_uniq.ids summary_${var}_filtered.txt  *top1Hits.txt *viruses_viroids.txt header* *.MOD *ENDEMIC_viruses_viroids *_ICTV  wanted* ${var}_filtered.txt ${var}.top1.ids summary_${var}_REGULATED_viruses_viroids
+
+done