Repository 'tapscan'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/tapscan

Changeset 1:c4f865bd101a (2024-02-22)
Previous changeset 0:196795831b6a (2024-02-14)
Commit message:
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/tapscan commit af8605d266717ed3453bd2a0947fed79c7098fb3
modified:
tapscan.xml
test-data/output.2.tsv
test-data/output.domtbl.tsv
added:
tapscan_classify.pl
tapscan_coverage_values_v11.txt
tapscan_domains_v13.txt.gz
tapscan_rules_v82.txt
removed:
tapscan_coverage_values_v10.txt
tapscan_domains_v12.txt.gz
tapscan_rules_v81.txt
tapscan_script_v74.pl
b
diff -r 196795831b6a -r c4f865bd101a tapscan.xml
--- a/tapscan.xml Wed Feb 14 13:54:16 2024 +0000
+++ b/tapscan.xml Thu Feb 22 10:07:53 2024 +0000
[
@@ -1,4 +1,4 @@
-<tool id="tapscan_classify" name="TAPScan Classify" version="4.74+galaxy0" profile="23.0">
+<tool id="tapscan_classify" name="TAPScan Classify" version="4.76+galaxy0" profile="23.0">
     <description>Detect Transcription Associated Proteins (TAPs)</description>
     <edam_topics>
         <edam_topic>topic_0121</edam_topic>
@@ -9,28 +9,28 @@
         <requirement type="package" version="4.8">sed</requirement>
     </requirements>
     <required_files>
-        <include type="literal" path="tapscan_script_v74.pl"/>
-        <include type="literal" path="tapscan_domains_v12.txt"/>
-        <include type="literal" path="tapscan_rules_v81.txt"/>
-        <include type="literal" path="tapscan_coverage_values_v10.txt"/>
+        <include type="literal" path="tapscan_classify.pl"/>
+        <include type="literal" path="tapscan_domains_v13.txt.gz"/>
+        <include type="literal" path="tapscan_rules_v82.txt"/>
+        <include type="literal" path="tapscan_coverage_values_v11.txt"/>
     </required_files>
     <command detect_errors="aggressive"><![CDATA[
 
 hmmsearch
   --domtblout domtblout.txt
   --cut_ga
-  '${__tool_directory__}/tapscan_domains_v12.txt.gz'
+  '${__tool_directory__}/tapscan_domains_v13.txt.gz'
   '$protein_fasta_in'
 
 &&
 
-perl '${__tool_directory__}/tapscan_script_v74.pl'
+perl '${__tool_directory__}/tapscan_classify.pl'
   domtblout.txt
-  '${__tool_directory__}/tapscan_rules_v81.txt'
+  '${__tool_directory__}/tapscan_rules_v82.txt'
   '$taps_detected'
   '$taps_family_counts'
   '$taps_detected_extra'
-  '${__tool_directory__}/tapscan_coverage_values_v10.txt'
+  '${__tool_directory__}/tapscan_coverage_values_v11.txt'
 
 &&
 
b
diff -r 196795831b6a -r c4f865bd101a tapscan_classify.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tapscan_classify.pl Thu Feb 22 10:07:53 2024 +0000
[
b'@@ -0,0 +1,932 @@\n+#!/usr/bin/perl\n+use strict;\n+use warnings;\n+use File::Basename;\n+\n+my $tapscan_version = "v4.76";\n+print "Running TAPscan Classify version $tapscan_version \\n\\n";\n+\n+\n+# Written by Gerrit Timmerhaus (gerrit.timmerhaus@biologie.uni-freiburg.de).\n+# Changes included by Kristian Ullrich, Per Wilhelmsson, Romy Petroll and Saskia Hiltemann.\n+\n+# Script to extract all detected domains out of a hmmsearch results file and classify the families of all used proteins based on these domains.\n+# The classification depends on a table which contains all known classification rules for the protein families of interest and on specific coverage values defined for every domain.\n+# The script provides three outputs, namely output.1, output.2 and output.3. The output files are tables in ";"-delimited format.\n+# The structure of output.1 is: "sequence ID ; TAP family ; number of classifications ; domains".\n+# Output.3 shares in principle the same structure as output.1, except that subfamilies are considered. ("sequence ID ; TAP family ; Subfamily ; number of classifications ; domains")\n+# The superior TAP family is specified first, followed by the subfamily. If a TAP family has no subfamily, the TAP family is specified first and then a "-".\n+# The structure of output.2 is: "TAP family";"number of detected proteins".\n+# More than one entry for a protein is possible because the classification rules may allow more than one classification.\n+#\n+# The script must be startet with the arguments <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamily classifications file> <"filter" if desired>\n+\n+if (!@ARGV or ($ARGV [0] eq "-h") or ($ARGV [0] eq "-help")) {\n+\tprint "Usage: extract.and.classify.pl <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamily classifications file> <\\"filter\\" (if desired)>\\n\\n";\n+\texit;\n+}\n+\n+# hmmsearch_output: domtblout file\n+my $hmmsearch_output = $ARGV [0];\n+# decision_table: rules file\n+my $decision_table = $ARGV [1];\n+# family_classifications: output.1\n+my $family_classifications = $ARGV [2];\n+# family_statistics: output.2\n+my $family_statistics = $ARGV [3];\n+# subfamily_classifications: output.3\n+my $subfamily_classifications = $ARGV [4];\n+# domspec_cuts: coverage values file\n+my $domspec_cuts = $ARGV [5];\n+# gene_model_filte: filter for ARATH and ORYSA\n+my $gene_model_filter = $ARGV [6];\n+\n+# get basename for output files\n+my($basename, $dirs, $suffix) = fileparse($hmmsearch_output, qr/\\.[^.]*/);\n+\n+if ($family_statistics eq "") {\n+\tprint "Usage: extract.and.classify.pl <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamil classifications file> <\\"filter\\" (if desired)>\\n\\n";\n+\texit;\n+}\n+\n+if ($gene_model_filter and $gene_model_filter eq "filter") {\n+\tprint "\\nGene model filter is activated. It only works for TAIR (Arabidopsis) and TIGR (Rice) proteins up to now\\n";\n+}\n+\n+# Array where the $hmmsearch-output/domtblout file will be stored\n+my @output = ();\n+# Array with domain-specific coverage values\n+my @cuts = ();\n+# Array with rules\n+my @dec_table = ();\n+# Counter for the number of detected domains in the hmmsearch output file\n+my $entry_counter = 0;\n+# Containes the actual result for a query sequence\n+my $akt_entry = "";\n+# Used to define query entry to ignore similar domains\n+my $whole_entry = "";\n+# Includes the final entries after ignoring similar domains\n+my @results_of_extraction = ();\n+# Used to define query entry to ignore similar domains\n+my $extracted_domain = "";\n+# Used to define query entry to ignore similar domains\n+my $present = "";\n+# Used to define query entry to ignore similar domains\n+my $protein = "";\n+\n+my $lek = "";\n+\n+############################################\n+### 1. Read in the hmmsearch output file ###\n+############################################\n+\n+print "\\n*** '..b'milies into $family_list to create output.2\n+foreach my $fcf_line (@family_classifications_file) {\n+\t$fcf_line =~ /^[^;]+;([^;]+)/;\n+\t#push @family_list, "$1";\n+\tprint FAMILY_CLASSIFICATIONS "$fcf_line";\n+}\n+close (FAMILY_CLASSIFICATIONS);\n+\n+foreach my $fcf_line (@subfamily_classifications_file) {\n+\t$fcf_line =~ /^[^;]+;([^;]+);([^;]+)/;\n+\tif ($2 eq "-") {\n+\t\tpush @family_list, "$1";\n+\t}\n+\telse {\n+\t\tpush @family_list, "$2";\n+\t}\n+\tprint SUBFAMILY_CLASSIFICATIONS "$fcf_line";\n+}\n+\n+close (SUBFAMILY_CLASSIFICATIONS);\n+\n+print "*** calculating the family statistics and write it in $family_statistics ***\\n\\n";\n+\n+##################################\n+### 5. Create the output files ###\n+##################################\n+\n+my $statistics_outputfile = "$family_statistics";\n+\n+unless (open(FAMILY_STATISTICS, ">$statistics_outputfile")) {\n+\tprint "Cannot open file \\"$statistics_outputfile\\" to write to!!\\n\\n";\n+\texit;\n+}\n+\n+# Count the family entries\n+my @output_family_statistics = ();\n+my @gefundene_familien = ();\n+my $family_counter = 1;\n+\n+shift @family_list;\n+@family_list = sort @family_list;\n+\n+\n+my $old_family = "";\n+push @family_list, \'BAD FIX\'; # Makes to loop go through every fam and stops at non fam(BAD FIX).\n+\n+foreach my $line (@family_list) {\n+\tif ($line eq $old_family) {\n+\t\t$family_counter++;\n+\t}\n+\telsif ($old_family ne "") {\n+\t\tpush (@output_family_statistics,"$old_family;$family_counter\\n");\n+\t\t# Add all found families to a list of found families\n+\t\tpush (@gefundene_familien,"$old_family");\n+\t        $family_counter=1;\n+\t}\n+\t$old_family = $line;\n+}\n+\n+my %hash = ();\n+\n+# Put all families from the classifictaion ruled and all found families in a hash\n+foreach my $element (@gefundene_familien,@liste_alle_familien) {$hash{$element}++;}\n+\n+# Remove merged families\n+delete $hash{\'GARP_ARR-B_Myb\'};\n+delete $hash{\'GARP_ARR-B_G2\'};\n+delete $hash{\'bZIP1\'};\n+delete $hash{\'bZIP2\'};\n+delete $hash{\'bZIPAUREO\'};\n+delete $hash{\'bZIPCDD\'};\n+delete $hash{\'HRT\'};\n+delete $hash{\'GIY_YIG\'};\n+\n+# Add all not found families from the classification rules to @output_family_statistics\n+# With zero as number of families found\n+\n+foreach my $element (keys %hash) {\n+\tif (($hash{$element} == 1) and ( $element eq "0_no_family_found")) {\n+\tpush (@output_family_statistics,"$element;$unclassified_families\\n");\n+\t}\n+\tif (($hash{$element} == 1) and ($element ne "0_no_family_found")) {\n+\tpush (@output_family_statistics,"$element;0\\n");\n+\t}\n+}\n+\n+# Sort @output_family_statistics caseinsensitive-alphabetically\n+my @sortierte_statistik = sort {lc $a cmp lc $b} @output_family_statistics;\n+\n+# Print headline to @sortierte_statistik\n+unshift (@sortierte_statistik,"family statistics for $hmmsearch_output\\n");\n+\n+print FAMILY_STATISTICS @sortierte_statistik;\n+\n+# Print FAMILY_STATISTICS "$old_family;$family_counter\\n";\n+\n+close (FAMILY_STATISTICS);\n+\n+#################################################\n+### 6. Give out some statistical informations ###\n+#################################################\n+\n+$entry_counter [0] = 0;\n+my $sum = 0;\n+foreach my $entry (@entry_counter) {\n+\t$sum += $entry;\n+}\n+print "$classified_families classifications were found for $sum proteins.\\n";\n+print "This classifications are divided in:\\n";\n+my $count = 0;\n+foreach my $element (@entry_counter) {\n+\tif ($count != 0) {\n+\t\tprint "$element proteins were classified for $count";\n+\t\tif ($count == 1) {print " family\\n";}\n+\t\telse {print " different families\\n";}\n+\t}\n+\t$count++;\n+}\n+print "\\n$unclassified_families proteins could not be classified\\n\\n";\n+\n+print "*** The results were written in $family_classifications and $subfamily_classifications ***\\n";\n+print "*** done ***\\n\\n";\n+\n+\n+exit;\n+\n+sub get_file_data {\n+\n+\tmy ($filename) = @_;\n+\n+\tuse strict;\n+\tuse warnings;\n+\n+\tmy @filedata = ();\n+\n+\tunless( open(GET_FILE_DATA, $filename)) {\n+\t\tprint STDERR "Cannot open file \\"$filename\\"n\\n";\n+\t\texit;\n+\t}\n+\n+\t@filedata = <GET_FILE_DATA>;\n+\n+\tclose GET_FILE_DATA;\n+\n+\treturn @filedata;\n+}\n+\n+\n'
b
diff -r 196795831b6a -r c4f865bd101a tapscan_coverage_values_v10.txt
--- a/tapscan_coverage_values_v10.txt Wed Feb 14 13:54:16 2024 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,154 +0,0 @@
-Acetyltransf_1 0.0931034483
-AP2 0.3214285714
-ARID 0.0516393443
-AUX_IAA 0.1077981651
-Auxin_resp 0.4308510638
-B3 0.196875
-BES1_N 0.3888888889
-BSD 0.485915493
-BTB 0.3105095541
-bZIP_1 0.64453125
-bZIP_2 0.5948275862
-C1_2 0.1612903226
-CAF1C_H4-bd 0.1764705882
-CBFB_NFYA 0.2019230769
-CCT 0.4952830189
-CG-1 0.5568181818
-CSD 0.5878378378
-DDT 0.2678571429
-DEAD 0.1115591398
-dsrm 0.2330097087
-DUF260 0.3638059701
-DUF296 0.155075188
-DUF547 0.1181672026
-DUF573 0.3515625
-DUF632 0.3302603037
-DUF702 0.2314126394
-E2F_TDP 0.2078313253
-EIN3 0.5874125874
-FHA 0.1777456647
-FLO_LFY 0.4569138277
-FYRC 0.3928571429
-FYRN 0.46875
-GAGA_bind 0.2265774379
-GATA 0.5406976744
-GRAS 0.3990825688
-Helicase_C 0.118852459
-HLH 0.3191489362
-HMG_box 0.6866197183
-Homeobox 0.5485074627
-HSF_DNA-bind 0.071641791
-IQ 0.6428571429
-JmjC 0.5058139535
-JmjN 0.4261363636
-K-box 0.438
-KNOX1 0.578125
-KNOX2 0.6346153846
-LIM 0.4726027397
-MBF1 0.3049450549
-Med26 0.1939655172
-Med31 0.451048951
-Med6 0.1577868852
-Med7 0.1818181818
-MEKHLA 0.4357541899
-mTERF 0.4850543478
-Myb_DNA-binding 0.3636363636
-NAM 0.0142857143
-O-FucT 0.0914866582
-Ovate 0.3137755102
-PAH 0.1194267516
-PAZ 0.1564569536
-PC4 0.3848684211
-PHD 0.37
-Piwi 0.4082278481
-PLATZ 0.30625
-PP2C 0.4110962567
-QLQ 0.61875
-RB_B 0.1663987138
-Rcd1 0.4131355932
-Response_reg 0.5017241379
-RFX_DNA_binding 0.5257731959
-RHD_DNA_bind 0.4581447964
-Ribonuclease_3 0.0935013263
-RRN3 0.2342427093
-Runt 0.5714285714
-RWP-RK 0.4542253521
-S1FA 0.7386363636
-SBP 0.4274193548
-SET 0.0392857143
-SH2 0.4197247706
-Sigma70_r2 0.1675531915
-Sigma70_r3 0.6346153846
-Sigma70_r4 0.6388888889
-SIR2 0.45703125
-SNF2_N 0.1150895141
-SRF-TF 0.406779661
-SSXT 0.5856164384
-START 0.5067567568
-STAT_bind 0.3781869688
-SWIB 0.3267326733
-SWIRM 0.2532467532
-TANGO2 0.125
-TCP 0.219665272
-TCR 0.4943181818
-TEA 0.4078282828
-TF_AP-2 0.5660377358
-Tfb2 0.1757668712
-tify 0.5955882353
-Tub 0.0872576177
-VEFS-Box 0.4918831169
-WD40 0.0563909774
-WHIM1 0.5263157895
-Whirly 0.5648148148
-WRC 0.6223404255
-WRKY 0.2196969697
-WSD 0.1768867925
-YABBY 0.2804621849
-zf-AN1 0.4027777778
-zf-B_box 0.2746478873
-zf-C2H2 0.5080645161
-zf-C5HC2 0.253125
-zf-CCCH 0.5689655172
-zf-Dof 0.5955882353
-ZF-HD_dimer 0.5294117647
-zf-MIZ 0.6805555556
-zf-TAZ 0.1566455696
-zf-ZPR1 0.1608910891
-Zn_clus 0.5480769231
-Alfin-like 0.75
-BEL 0.75
-DNC 0.75
-FIE_clipped_for_HMM 0.75
-G2-like_Domain 0.75
-HRT 0.75
-KNOXC 0.75
-LUFS_Domain 0.75
-NF-YB 0.75
-NF-YC 0.75
-NOZZLE 0.75
-PINTOX 0.75
-STER_AP 0.75
-trihelix 0.75
-ULT_Domain 0.75
-VARL 0.75
-VOZ_Domain 0.75
-WOX_HD 0.75
-CXC 0.75
-bZIP_AUREO 0.75
-bZIP_CDD 0.50
-ALOG 0.75
-C2H2-IDD 0.75
-zf-MYST 0.75
-CBP 0.75
-DUF3591 0.75
-LOB2 0.75
-zz-ADA2 0.75
-NLP 0.75
-CRF 0.75
-GIY_YIG 0.75
-ZPR 0.75
-LD 0.75
-NDX 0.75
-SAWADEE 0.75
-C1HDZ 0.75
-C2HDZ 0.75
b
diff -r 196795831b6a -r c4f865bd101a tapscan_coverage_values_v11.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tapscan_coverage_values_v11.txt Thu Feb 22 10:07:53 2024 +0000
b
@@ -0,0 +1,155 @@
+Acetyltransf_1 0.0931034483
+AP2 0.3214285714
+ARID 0.0516393443
+AUX_IAA 0.1077981651
+Auxin_resp 0.4308510638
+B3 0.196875
+BES1_N 0.3888888889
+BSD 0.485915493
+BTB 0.3105095541
+bZIP_1 0.64453125
+bZIP_2 0.5948275862
+C1_2 0.1612903226
+CAF1C_H4-bd 0.1764705882
+CBFB_NFYA 0.2019230769
+CCT 0.4952830189
+CG-1 0.5568181818
+CSD 0.5878378378
+DDT 0.2678571429
+DEAD 0.1115591398
+dsrm 0.2330097087
+DUF260 0.3638059701
+DUF296 0.155075188
+DUF547 0.1181672026
+DUF573 0.3515625
+DUF632 0.3302603037
+DUF702 0.2314126394
+E2F_TDP 0.2078313253
+EIN3 0.5874125874
+FHA 0.1777456647
+FLO_LFY 0.4569138277
+FYRC 0.3928571429
+FYRN 0.46875
+GAGA_bind 0.2265774379
+GATA 0.5406976744
+GRAS 0.3990825688
+Helicase_C 0.118852459
+HLH 0.3191489362
+HMG_box 0.6866197183
+Homeobox 0.5485074627
+HSF_DNA-bind 0.071641791
+IQ 0.6428571429
+JmjC 0.5058139535
+JmjN 0.4261363636
+K-box 0.438
+KNOX1 0.578125
+KNOX2 0.6346153846
+LIM 0.4726027397
+MBF1 0.3049450549
+Med26 0.1939655172
+Med31 0.451048951
+Med6 0.1577868852
+Med7 0.1818181818
+MEKHLA 0.4357541899
+mTERF 0.4850543478
+Myb_DNA-binding 0.3636363636
+NAM 0.0142857143
+O-FucT 0.0914866582
+Ovate 0.3137755102
+PAH 0.1194267516
+PAZ 0.1564569536
+PC4 0.3848684211
+PHD 0.37
+Piwi 0.4082278481
+PLATZ 0.30625
+PP2C 0.4110962567
+QLQ 0.61875
+RB_B 0.1663987138
+Rcd1 0.4131355932
+Response_reg 0.5017241379
+RFX_DNA_binding 0.5257731959
+RHD_DNA_bind 0.4581447964
+Ribonuclease_3 0.0935013263
+RRN3 0.2342427093
+Runt 0.5714285714
+RWP-RK 0.4542253521
+S1FA 0.7386363636
+SBP 0.4274193548
+SET 0.0392857143
+SH2 0.4197247706
+Sigma70_r2 0.1675531915
+Sigma70_r3 0.6346153846
+Sigma70_r4 0.6388888889
+SIR2 0.45703125
+SNF2_N 0.1150895141
+SRF-TF 0.406779661
+SSXT 0.5856164384
+START 0.5067567568
+STAT_bind 0.3781869688
+SWIB 0.3267326733
+SWIRM 0.2532467532
+TANGO2 0.125
+TCP 0.219665272
+TCR 0.4943181818
+TEA 0.4078282828
+TF_AP-2 0.5660377358
+Tfb2 0.1757668712
+tify 0.5955882353
+Tub 0.0872576177
+VEFS-Box 0.4918831169
+WD40 0.0563909774
+WHIM1 0.5263157895
+Whirly 0.5648148148
+WRC 0.6223404255
+WRKY 0.2196969697
+WSD 0.1768867925
+YABBY 0.2804621849
+zf-AN1 0.4027777778
+zf-B_box 0.2746478873
+zf-C2H2 0.5080645161
+zf-C5HC2 0.253125
+zf-CCCH 0.5689655172
+zf-Dof 0.5955882353
+ZF-HD_dimer 0.5294117647
+zf-MIZ 0.6805555556
+zf-TAZ 0.1566455696
+zf-ZPR1 0.1608910891
+Zn_clus 0.5480769231
+Alfin-like 0.75
+BEL 0.75
+DNC 0.75
+FIE_clipped_for_HMM 0.75
+G2-like_Domain 0.75
+HRT 0.75
+KNOXC 0.75
+LUFS_Domain 0.75
+NF-YB 0.75
+NF-YC 0.75
+NOZZLE 0.75
+PINTOX 0.75
+STER_AP 0.75
+trihelix 0.75
+ULT_Domain 0.75
+VARL 0.75
+VOZ_Domain 0.75
+WOX_HD 0.75
+CXC 0.75
+bZIP_AUREO 0.75
+bZIP_CDD 0.50
+ALOG 0.75
+C2H2-IDD 0.75
+zf-MYST 0.75
+CBP 0.75
+DUF3591 0.75
+LOB2 0.75
+zz-ADA2 0.75
+NLP 0.75
+CRF 0.75
+GIY_YIG 0.75
+ZPR 0.75
+LD 0.75
+NDX 0.75
+SAWADEE 0.75
+C1HDZ 0.75
+C2HDZ 0.75
+Homeobox_KN 0.75
b
diff -r 196795831b6a -r c4f865bd101a tapscan_domains_v12.txt.gz
b
Binary file tapscan_domains_v12.txt.gz has changed
b
diff -r 196795831b6a -r c4f865bd101a tapscan_domains_v13.txt.gz
b
Binary file tapscan_domains_v13.txt.gz has changed
b
diff -r 196795831b6a -r c4f865bd101a tapscan_rules_v81.txt
--- a/tapscan_rules_v81.txt Wed Feb 14 13:54:16 2024 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,298 +0,0 @@
-ABI3/VP1;AP2;should not
-ABI3/VP1;Auxin_resp;should not
-ABI3/VP1;B3;should
-ABI3/VP1;WRKY;should not
-Alfin-like;Alfin-like;should
-Alfin-like;Homeobox;should not
-Alfin-like;zf-TAZ;should not
-Alfin-like;PHD;should not
-AP2;AP2;should
-AP2;CRF;should not
-ARF;Auxin_resp;should
-Argonaute;Piwi;should
-Argonaute;PAZ;should
-ARID;ARID;should
-Aux/IAA;AUX_IAA;should
-Aux/IAA;Auxin_resp;should not
-Aux/IAA;B3;should not
-BBR/BPC;GAGA_bind;should
-BES1;BES1_N;should
-bHLH;HLH;should
-bHLH;TCP;should not
-bHLH_TCP;TCP;should
-bHSH;TF_AP-2;should
-BSD domain containing;BSD;should
-bZIP1;bZIP_1;should
-bZIP1;HLH;should not
-bZIP1;Homeobox;should not
-bZIP2;bZIP_2;should
-bZIP2;HLH;should not
-bZIP2;Homeobox;should not
-bZIPAUREO;bZIP_AUREO;should
-bZIPAUREO;HLH;should not
-bZIPAUREO;Homeobox;should not
-bZIPCDD;bZIP_CDD;should
-bZIPCDD;HLH;should not
-bZIPCDD;Homeobox;should not
-C2C2_CO-like;CCT;should
-C2C2_CO-like;GATA;should not
-C2C2_CO-like;tify;should not
-C2C2_CO-like;PLATZ;should not
-C2C2_CO-like;zf-B_box;should
-C2C2_Dof;zf-Dof;should
-C2C2_Dof;GATA;should not
-C2C2_GATA;GATA;should
-C2C2_GATA;tify;should not
-C2C2_GATA;zf-Dof;should not
-C2C2_YABBY;YABBY;should
-C2H2;zf-C2H2;should
-C2H2;zf-MIZ;should not
-C3H;AP2;should not
-C3H;SRF-TF;should not
-C3H;MYB-2R;should not
-C3H;MYB-3R;should not
-C3H;MYB-4R;should not
-C3H;zf-C2H2;should not
-C3H;zf-CCCH;should
-CAMTA;CG-1;should
-CAMTA;IQ;should
-NF-YA;bZIP_1;should not
-NF-YA;bZIP_2;should not
-NF-YA;CBFB_NFYA;should
-NF-YB;NF-YB;should
-NF-YB;NF-YC;should not
-NF-YC;NF-YB;should not
-NF-YC;NF-YC;should
-NF-YC;HMG_box;should not
-Coactivator p15;PC4;should
-CPP;TCR;should
-CSD;CSD;should
-CudA;STAT_bind;should
-CudA;SH2;should
-DBP;DNC;should
-DBP;PP2C;should
-DDT;DDT;should
-DDT;Homeobox;should not
-DDT;Alfin-like;should not
-Dicer;Piwi;should not
-Dicer;DEAD;should
-Dicer;Helicase_C;should
-Dicer;Ribonuclease_3;should
-Dicer;dsrm;should
-DUF246 domain containing/O-FucT;O-FucT;should
-DUF296 domain containing;DUF296;should
-DUF547 domain containing;DUF547;should
-DUF632 domain containing;DUF632;should
-DUF833 domain containing/TANGO2;TANGO2;should
-E2F/DP;E2F_TDP;should
-EIL;EIN3;should
-FHA;FHA;should
-GARP_ARR-B_G2;CCT;should not
-GARP_ARR-B_G2;G2-like_Domain;should
-GARP_ARR-B_G2;Response_reg;should
-GARP_ARR-B_Myb;CCT;should not
-GARP_ARR-B_Myb;Response_reg;should
-GARP_ARR-B_Myb;Myb_DNA-binding;should
-GARP_G2-like;G2-like_Domain;should
-GARP_G2-like;Response_reg;should not
-GARP_G2-like;Myb_DNA-binding;should not
-GeBP;DUF573;should
-GIF;SSXT;should
-GNAT;Acetyltransf_1;should
-GNAT;PHD;should not
-GRAS;GRAS;should
-GRF;QLQ;should
-GRF;WRC;should
-C3HDZ;Homeobox;should
-C3HDZ;START;should
-C3HDZ;MEKHLA;should
-C4HDZ;Homeobox;should
-C4HDZ;START;should
-C4HDZ;MEKHLA;should not
-HD_PLINC;ZF-HD_dimer;should
-HD_WOX;WOX_HD;should
-HD_DDT;Homeobox;should
-HD_DDT;DDT;should
-HD_DDT;WHIM1;should
-HD_DDT;WSD;should
-HD_PHD;PHD;should
-HD_PHD;Homeobox;should
-HD_PINTOX;Homeobox;should
-HD_PINTOX;PINTOX;should
-HD_BEL;Homeobox;should
-HD_BEL;BEL;should
-HD_KNOX1;Homeobox;should
-HD_KNOX1;KNOX1;should
-HD_KNOX1;KNOX2;should
-HD_KNOX1;KNOXC;should not
-HD_KNOX2;Homeobox;should
-HD_KNOX2;KNOX1;should
-HD_KNOX2;KNOX2;should
-HD_KNOX2;KNOXC;should
-HD-other;EIN3;should not
-HD-other;Homeobox;should
-HD-other;bZIP_1;should not
-HD-other;WOX_HD;should not
-HD-other;PINTOX;should not
-HD-other;PHD;should not
-HD-other;BEL;should not
-HMG;ARID;should not
-HMG;HMG_box;should
-HMG;YABBY;should not
-HRT;HRT;should
-HSF;HSF_DNA-bind;should
-IWS1;Med26;should
-Jumonji_PKDM7;JmjC;should
-Jumonji_PKDM7;JmjN;should
-Jumonji_PKDM7;zf-C5HC2;should
-Jumonji_PKDM7;FYRN;should
-Jumonji_PKDM7;FYRC;should
-Jumonji_Other;JmjC;should
-LFY;FLO_LFY;should
-LIM;two_or_more_LIM;should
-LUG;LUFS_Domain;should
-MADS;SRF-TF;should
-MADS;K-box;should not
-MADS_MIKC;SRF-TF;should
-MADS_MIKC;K-box;should
-MBF1;MBF1;should
-Med6;Med6;should
-Med7;Med7;should
-mTERF;mTERF;should
-MYB-2R;G2-like_Domain;should not
-MYB-2R;Response_reg;should not
-MYB-2R;trihelix;should not
-MYB-2R;MYB-2R;should
-MYB-3R;G2-like_Domain;should not
-MYB-3R;Response_reg;should not
-MYB-3R;trihelix;should not
-MYB-3R;MYB-3R;should
-MYB-4R;G2-like_Domain;should not
-MYB-4R;Response_reg;should not
-MYB-4R;trihelix;should not
-MYB-4R;MYB-4R;should
-MYB-related;ARID;should not
-MYB-related;G2-like_Domain;should not
-MYB-related;Myb_DNA-binding;should
-MYB-related;Response_reg;should not
-MYB-related;trihelix;should not
-MYB-related;MYB-2R;should not
-MYB-related;MYB-3R;should not
-MYB-related;MYB-4R;should not
-NAC;NAM;should
-NZZ;NOZZLE;should
-OFP;Ovate;should
-PcG_EZ;CXC;should
-PcG_EZ;SET;should
-PcG_FIE;FIE_clipped_for_HMM;should
-PcG_FIE;WD40;should
-PcG_VEFS;VEFS-Box;should
-PcG_VEFS;zf-C2H2;should not
-PcG_MSI;WD40;should
-PcG_MSI;CAF1C_H4-bd;should
-PcG_MSI;FIE_clipped_for_HMM;should not
-PHD;Myb_DNA-binding;should not
-PHD;Alfin-like;should not
-PHD;ARID;should not
-PHD;DDT;should not
-PHD;Homeobox;should not
-PHD;JmjC;should not
-PHD;JmjN;should not
-PHD;PHD;should
-PHD;SWIB;should not
-PHD;zf-TAZ;should not
-PHD;zf-MIZ;should not
-PHD;zf-CCCH;should not
-PHD;HMG_box;should not
-PLATZ;PLATZ;should
-Pseudo ARR-B;CCT;should
-Pseudo ARR-B;Response_reg;should
-Pseudo ARR-B;tify;should not
-RB;RB_B;should
-Rcd1-like;Rcd1;should
-Rel;RHD_DNA_bind;should
-RF-X;RFX_DNA_binding;should
-RRN3;RRN3;should
-Runt;Runt;should
-S1Fa-like;S1FA;should
-SAP;STER_AP;should
-SBP;SBP;should
-SET;zf-C2H2;should not
-SET;TCR;should not
-SET;CXC;should not
-SET;PHD;should not
-SET;Myb_DNA-binding;should not
-SET;SET;should
-Sigma70-like;Sigma70_r2;should
-Sigma70-like;Sigma70_r3;should
-Sigma70-like;Sigma70_r4;should
-Sin3;PAH;should
-Sin3;WRKY;should not
-Sir2;SIR2;should
-SOH1;Med31;should
-SRS;DUF702;should
-SWI/SNF_BAF60b;SWIB;should
-SWI/SNF_SNF2;AP2;should not
-SWI/SNF_SNF2;PHD;should not
-SWI/SNF_SNF2;SNF2_N;should
-SWI/SNF_SNF2;zf-CCCH;should not
-SWI/SNF_SNF2;Myb_DNA-binding;should not
-SWI/SNF_SNF2;HMG_box;should not
-SWI/SNF_SWI3;SWIRM;should
-SWI/SNF_SWI3;Myb_DNA-binding;should
-TEA;TEA;should
-TFb2;Tfb2;should
-tify;tify;should
-TRAF;BTB;should
-TRAF;zf-TAZ;should not
-Trihelix;trihelix;should
-TUB;Tub;should
-ULT;ULT_Domain;should
-VARL;VARL;should
-VOZ;VOZ_Domain;should
-Whirly;Whirly;should
-WRKY;WRKY;should
-Zinc finger, AN1 and A20 type;zf-AN1;should
-Zinc finger, AN1 and A20 type;zf-C2H2;should not
-Zinc finger, MIZ type;zf-MIZ;should
-Zinc finger, MIZ type;zf-C2H2;should not
-Zinc finger, ZPR1;zf-ZPR1;should
-Zn_clus;Zn_clus;should
-ALOG;ALOG;should
-C2H2;C2H2-IDD;should not
-C2H2_IDD;C2H2-IDD;should
-C2H2_IDD;zf-C2H2;should
-MYST;zf-MYST;should
-CBP;CBP;should
-CBP;zf-TAZ;should
-CBP;BTB;should not
-TAFII250;DUF3591;should
-LOB1;bZIP_1;should not
-LOB1;bZIP_2;should not
-LOB1;DUF260;should
-LOB1;HLH;should not
-LOB1;Homeobox;should not
-LOB2;LOB2;should
-LDL/FLD;SWIRM;should
-LDL/FLD;Myb_DNA-binding;should not
-ADA2;zz-ADA2;should
-ADA2;Myb_DNA-binding;should
-RKD;RWP-RK;should
-RKD;NLP;should not
-NLP;RWP-RK;should
-NLP;NLP;should
-CRF;CRF;should
-CRF;AP2;should
-GIY_YIG;GIY_YIG;should
-ZPR;ZPR;should
-HD-LD;LD;should
-HD-NDX;NDX;should
-HD-SAWADEE;SAWADEE;should
-C1HDZ;C1HDZ;should
-C1HDZ;Homeobox;should
-C1HDZ;START;should not
-C1HDZ;MEKHLA;should not
-C2HDZ;C2HDZ;should
-C2HDZ;Homeobox;should
-C2HDZ;START;should not
-C2HDZ;MEKHLA;should not
b
diff -r 196795831b6a -r c4f865bd101a tapscan_rules_v82.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tapscan_rules_v82.txt Thu Feb 22 10:07:53 2024 +0000
b
@@ -0,0 +1,302 @@
+ABI3/VP1;AP2;should not
+ABI3/VP1;Auxin_resp;should not
+ABI3/VP1;B3;should
+ABI3/VP1;WRKY;should not
+Alfin-like;Alfin-like;should
+Alfin-like;Homeobox;should not
+Alfin-like;zf-TAZ;should not
+Alfin-like;PHD;should not
+AP2;AP2;should
+AP2;CRF;should not
+ARF;Auxin_resp;should
+Argonaute;Piwi;should
+Argonaute;PAZ;should
+ARID;ARID;should
+Aux/IAA;AUX_IAA;should
+Aux/IAA;Auxin_resp;should not
+Aux/IAA;B3;should not
+BBR/BPC;GAGA_bind;should
+BES1;BES1_N;should
+bHLH;HLH;should
+bHLH;TCP;should not
+bHLH_TCP;TCP;should
+bHSH;TF_AP-2;should
+BSD domain containing;BSD;should
+bZIP1;bZIP_1;should
+bZIP1;HLH;should not
+bZIP1;Homeobox;should not
+bZIP2;bZIP_2;should
+bZIP2;HLH;should not
+bZIP2;Homeobox;should not
+bZIPAUREO;bZIP_AUREO;should
+bZIPAUREO;HLH;should not
+bZIPAUREO;Homeobox;should not
+bZIPCDD;bZIP_CDD;should
+bZIPCDD;HLH;should not
+bZIPCDD;Homeobox;should not
+C2C2_CO-like;CCT;should
+C2C2_CO-like;GATA;should not
+C2C2_CO-like;tify;should not
+C2C2_CO-like;PLATZ;should not
+C2C2_CO-like;zf-B_box;should
+C2C2_Dof;zf-Dof;should
+C2C2_Dof;GATA;should not
+C2C2_GATA;GATA;should
+C2C2_GATA;tify;should not
+C2C2_GATA;zf-Dof;should not
+C2C2_YABBY;YABBY;should
+C2H2;zf-C2H2;should
+C2H2;zf-MIZ;should not
+C3H;AP2;should not
+C3H;SRF-TF;should not
+C3H;MYB-2R;should not
+C3H;MYB-3R;should not
+C3H;MYB-4R;should not
+C3H;zf-C2H2;should not
+C3H;zf-CCCH;should
+CAMTA;CG-1;should
+CAMTA;IQ;should
+NF-YA;bZIP_1;should not
+NF-YA;bZIP_2;should not
+NF-YA;CBFB_NFYA;should
+NF-YB;NF-YB;should
+NF-YB;NF-YC;should not
+NF-YC;NF-YB;should not
+NF-YC;NF-YC;should
+NF-YC;HMG_box;should not
+Coactivator p15;PC4;should
+CPP;TCR;should
+CSD;CSD;should
+CudA;STAT_bind;should
+CudA;SH2;should
+DBP;DNC;should
+DBP;PP2C;should
+DDT;DDT;should
+DDT;Homeobox;should not
+DDT;Alfin-like;should not
+Dicer;Piwi;should not
+Dicer;DEAD;should
+Dicer;Helicase_C;should
+Dicer;Ribonuclease_3;should
+Dicer;dsrm;should
+DUF246 domain containing/O-FucT;O-FucT;should
+DUF296 domain containing;DUF296;should
+DUF547 domain containing;DUF547;should
+DUF632 domain containing;DUF632;should
+DUF833 domain containing/TANGO2;TANGO2;should
+E2F/DP;E2F_TDP;should
+EIL;EIN3;should
+FHA;FHA;should
+GARP_ARR-B_G2;CCT;should not
+GARP_ARR-B_G2;G2-like_Domain;should
+GARP_ARR-B_G2;Response_reg;should
+GARP_ARR-B_Myb;CCT;should not
+GARP_ARR-B_Myb;Response_reg;should
+GARP_ARR-B_Myb;Myb_DNA-binding;should
+GARP_G2-like;G2-like_Domain;should
+GARP_G2-like;Response_reg;should not
+GARP_G2-like;Myb_DNA-binding;should not
+GeBP;DUF573;should
+GIF;SSXT;should
+GNAT;Acetyltransf_1;should
+GNAT;PHD;should not
+GRAS;GRAS;should
+GRF;QLQ;should
+GRF;WRC;should
+C3HDZ;Homeobox;should
+C3HDZ;START;should
+C3HDZ;MEKHLA;should
+C4HDZ;Homeobox;should
+C4HDZ;START;should
+C4HDZ;MEKHLA;should not
+HD_PLINC;ZF-HD_dimer;should
+HD_WOX;WOX_HD;should
+HD_DDT;Homeobox;should
+HD_DDT;DDT;should
+HD_DDT;WHIM1;should
+HD_DDT;WSD;should
+HD_PHD;PHD;should
+HD_PHD;Homeobox;should
+HD_PINTOX;Homeobox;should
+HD_PINTOX;PINTOX;should
+HD_TALE_BEL;Homeobox_KN;should
+HD_TALE_BEL;BEL;should
+HD_TALE_KNOX1;Homeobox_KN;should
+HD_TALE_KNOX1;KNOX1;should
+HD_TALE_KNOX1;KNOX2;should
+HD_TALE_KNOX1;KNOXC;should not
+HD_TALE_KNOX2;Homeobox_KN;should
+HD_TALE_KNOX2;KNOX1;should
+HD_TALE_KNOX2;KNOX2;should
+HD_TALE_KNOX2;KNOXC;should
+HD-other;EIN3;should not
+HD-other;Homeobox;should
+HD-other;bZIP_1;should not
+HD-other;WOX_HD;should not
+HD-other;PINTOX;should not
+HD-other;PHD;should not
+HD-other;BEL;should not
+HD_TALE;Homeobox_KN;should
+HD_TALE;BEL;should not
+HD_TALE;KNOX1;should not
+HD_TALE;KNOX2;should not
+HMG;ARID;should not
+HMG;HMG_box;should
+HMG;YABBY;should not
+HRT;HRT;should
+HSF;HSF_DNA-bind;should
+IWS1;Med26;should
+Jumonji_PKDM7;JmjC;should
+Jumonji_PKDM7;JmjN;should
+Jumonji_PKDM7;zf-C5HC2;should
+Jumonji_PKDM7;FYRN;should
+Jumonji_PKDM7;FYRC;should
+Jumonji_Other;JmjC;should
+LFY;FLO_LFY;should
+LIM;two_or_more_LIM;should
+LUG;LUFS_Domain;should
+MADS;SRF-TF;should
+MADS;K-box;should not
+MADS_MIKC;SRF-TF;should
+MADS_MIKC;K-box;should
+MBF1;MBF1;should
+Med6;Med6;should
+Med7;Med7;should
+mTERF;mTERF;should
+MYB-2R;G2-like_Domain;should not
+MYB-2R;Response_reg;should not
+MYB-2R;trihelix;should not
+MYB-2R;MYB-2R;should
+MYB-3R;G2-like_Domain;should not
+MYB-3R;Response_reg;should not
+MYB-3R;trihelix;should not
+MYB-3R;MYB-3R;should
+MYB-4R;G2-like_Domain;should not
+MYB-4R;Response_reg;should not
+MYB-4R;trihelix;should not
+MYB-4R;MYB-4R;should
+MYB-related;ARID;should not
+MYB-related;G2-like_Domain;should not
+MYB-related;Myb_DNA-binding;should
+MYB-related;Response_reg;should not
+MYB-related;trihelix;should not
+MYB-related;MYB-2R;should not
+MYB-related;MYB-3R;should not
+MYB-related;MYB-4R;should not
+NAC;NAM;should
+NZZ;NOZZLE;should
+OFP;Ovate;should
+PcG_EZ;CXC;should
+PcG_EZ;SET;should
+PcG_FIE;FIE_clipped_for_HMM;should
+PcG_FIE;WD40;should
+PcG_VEFS;VEFS-Box;should
+PcG_VEFS;zf-C2H2;should not
+PcG_MSI;WD40;should
+PcG_MSI;CAF1C_H4-bd;should
+PcG_MSI;FIE_clipped_for_HMM;should not
+PHD;Myb_DNA-binding;should not
+PHD;Alfin-like;should not
+PHD;ARID;should not
+PHD;DDT;should not
+PHD;Homeobox;should not
+PHD;JmjC;should not
+PHD;JmjN;should not
+PHD;PHD;should
+PHD;SWIB;should not
+PHD;zf-TAZ;should not
+PHD;zf-MIZ;should not
+PHD;zf-CCCH;should not
+PHD;HMG_box;should not
+PLATZ;PLATZ;should
+Pseudo ARR-B;CCT;should
+Pseudo ARR-B;Response_reg;should
+Pseudo ARR-B;tify;should not
+RB;RB_B;should
+Rcd1-like;Rcd1;should
+Rel;RHD_DNA_bind;should
+RF-X;RFX_DNA_binding;should
+RRN3;RRN3;should
+Runt;Runt;should
+S1Fa-like;S1FA;should
+SAP;STER_AP;should
+SBP;SBP;should
+SET;zf-C2H2;should not
+SET;TCR;should not
+SET;CXC;should not
+SET;PHD;should not
+SET;Myb_DNA-binding;should not
+SET;SET;should
+Sigma70-like;Sigma70_r2;should
+Sigma70-like;Sigma70_r3;should
+Sigma70-like;Sigma70_r4;should
+Sin3;PAH;should
+Sin3;WRKY;should not
+Sir2;SIR2;should
+SOH1;Med31;should
+SRS;DUF702;should
+SWI/SNF_BAF60b;SWIB;should
+SWI/SNF_SNF2;AP2;should not
+SWI/SNF_SNF2;PHD;should not
+SWI/SNF_SNF2;SNF2_N;should
+SWI/SNF_SNF2;zf-CCCH;should not
+SWI/SNF_SNF2;Myb_DNA-binding;should not
+SWI/SNF_SNF2;HMG_box;should not
+SWI/SNF_SWI3;SWIRM;should
+SWI/SNF_SWI3;Myb_DNA-binding;should
+TEA;TEA;should
+TFb2;Tfb2;should
+tify;tify;should
+TRAF;BTB;should
+TRAF;zf-TAZ;should not
+Trihelix;trihelix;should
+TUB;Tub;should
+ULT;ULT_Domain;should
+VARL;VARL;should
+VOZ;VOZ_Domain;should
+Whirly;Whirly;should
+WRKY;WRKY;should
+Zinc finger, AN1 and A20 type;zf-AN1;should
+Zinc finger, AN1 and A20 type;zf-C2H2;should not
+Zinc finger, MIZ type;zf-MIZ;should
+Zinc finger, MIZ type;zf-C2H2;should not
+Zinc finger, ZPR1;zf-ZPR1;should
+Zn_clus;Zn_clus;should
+ALOG;ALOG;should
+C2H2;C2H2-IDD;should not
+C2H2_IDD;C2H2-IDD;should
+C2H2_IDD;zf-C2H2;should
+MYST;zf-MYST;should
+CBP;CBP;should
+CBP;zf-TAZ;should
+CBP;BTB;should not
+TAFII250;DUF3591;should
+LOB1;bZIP_1;should not
+LOB1;bZIP_2;should not
+LOB1;DUF260;should
+LOB1;HLH;should not
+LOB1;Homeobox;should not
+LOB2;LOB2;should
+LDL/FLD;SWIRM;should
+LDL/FLD;Myb_DNA-binding;should not
+ADA2;zz-ADA2;should
+ADA2;Myb_DNA-binding;should
+RKD;RWP-RK;should
+RKD;NLP;should not
+NLP;RWP-RK;should
+NLP;NLP;should
+CRF;CRF;should
+CRF;AP2;should
+GIY_YIG;GIY_YIG;should
+ZPR;ZPR;should
+HD-LD;LD;should
+HD-NDX;NDX;should
+HD-SAWADEE;SAWADEE;should
+C1HDZ;C1HDZ;should
+C1HDZ;Homeobox;should
+C1HDZ;START;should not
+C1HDZ;MEKHLA;should not
+C2HDZ;C2HDZ;should
+C2HDZ;Homeobox;should
+C2HDZ;START;should not
+C2HDZ;MEKHLA;should not
b
diff -r 196795831b6a -r c4f865bd101a tapscan_script_v74.pl
--- a/tapscan_script_v74.pl Wed Feb 14 13:54:16 2024 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,916 +0,0 @@\n-#!/usr/bin/perl\n-use strict;\n-use warnings;\n-\n-# Written by Gerrit Timmerhaus (gerrit.timmerhaus@biologie.uni-freiburg.de).\n-# Changes included by Kristian Ullrich, Per Wilhelmsson and Romy Petroll.\n-\n-# Script to extract all detected domains out of a hmmsearch results file and classify the families of all used proteins based on these domains.\n-# The classification depends on a table which contains all known classification rules for the protein families of interest and on specific coverage values defined for every domain.\n-# The script provides three outputs, namely output.1, output.2 and output.3. The output files are tables in ";"-delimited format.\n-# The structure of output.1 is: "sequence ID ; TAP family ; number of classifications ; domains". \n-# Output.3 shares in principle the same structure as output.1, except that subfamilies are considered. ("sequence ID ; TAP family ; Subfamily ; number of classifications ; domains")\n-# The superior TAP family is specified first, followed by the subfamily. If a TAP family has no subfamily, the TAP family is specified first and then a "-". \n-# The structure of output.2 is: "TAP family";"number of detected proteins".\n-# More than one entry for a protein is possible because the classification rules may allow more than one classification.\n-#\n-# The script must be startet with the arguments <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamily classifications file> <"filter" if desired>\n-\n-if (!@ARGV or ($ARGV [0] eq "-h") or ($ARGV [0] eq "-help")) {\n-\tprint "Usage: extract.and.classify.pl <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamily classifications file> <\\"filter\\" (if desired)>\\n\\n";\n-\texit;\n-}\n-\n-# hmmsearch_output: domtblout file\n-my $hmmsearch_output = $ARGV [0];\n-# decision_table: rules file\n-my $decision_table = $ARGV [1];\n-# family_classifications: output.1\n-my $family_classifications = $ARGV [2]; \n-# family_statistics: output.2\n-my $family_statistics = $ARGV [3];\n-# subfamily_classifications: output.3\n-my $subfamily_classifications = $ARGV [4];\n-# domspec_cuts: coverage values file\n-my $domspec_cuts = $ARGV [5];\n-# gene_model_filte: filter for ARATH and ORYSA\n-my $gene_model_filter = $ARGV [6];\n-\n-if ($family_statistics eq "") {\n-\tprint "Usage: extract.and.classify.pl <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamil classifications file> <\\"filter\\" (if desired)>\\n\\n";\n-\texit;\n-}\n-\n-if ($gene_model_filter and $gene_model_filter eq "filter") {\n-\tprint "\\nGene model filter is activated. It only works for TAIR (Arabidopsis) and TIGR (Rice) proteins up to now\\n";\n-}\n-\n-# Array where the $hmmsearch-output/domtblout file will be stored\n-my @output = ();\n-# Array with domain-specific coverage values\n-my @cuts = ();\n-# Array with rules\n-my @dec_table = ();\n-# Counter for the number of detected domains in the hmmsearch output file\n-my $entry_counter = 0;\n-# Containes the actual result for a query sequence\n-my $akt_entry = "";\n-# Used to define query entry to ignore similar domains\n-my $whole_entry = ""; \n-# Includes the final entries after ignoring similar domains\n-my @results_of_extraction = ();\n-# Used to define query entry to ignore similar domains\n-my $extracted_domain = "";\n-# Used to define query entry to ignore similar domains\n-my $present = "";\n-# Used to define query entry to ignore similar domains\n-my $protein = ""; \n-\n-my $lek = "";\n-\n-############################################\n-### 1. Read in the hmmsearch output file ###\n-############################################\n-\n-print "\\n*** reading in $hmmsearch_output ***\\n\\n";\n-\n-@output = get_file_data("$hmmsearch_output");\n-\n-print "*** Parsing $hmmsearch_output ***\\n\\n";\n-\n-# If wrong format exit the program, ninth row from the end\n-if ($output [-9] !~ /^# Program:         hmms'..b'ilies into $family_list to create output.2\n-foreach my $fcf_line (@family_classifications_file) {\n-\t$fcf_line =~ /^[^;]+;([^;]+)/;\n-\t#push @family_list, "$1";\n-\tprint FAMILY_CLASSIFICATIONS "$fcf_line";\n-}\n-close (FAMILY_CLASSIFICATIONS);\n-\n-foreach my $fcf_line (@subfamily_classifications_file) {\n-\t$fcf_line =~ /^[^;]+;([^;]+);([^;]+)/;\n-\tif ($2 eq "-") {\n-\t\tpush @family_list, "$1";\n-\t}\n-\telse {\n-\t\tpush @family_list, "$2";\n-\t}\n-\tprint SUBFAMILY_CLASSIFICATIONS "$fcf_line";\n-} \n-\n-close (SUBFAMILY_CLASSIFICATIONS);\n-\n-print "*** calculating the family statistics and write it in $family_statistics ***\\n\\n";\n-\n-##################################\n-### 5. Create the output files ###\n-##################################\n-\n-my $statistics_outputfile = "$family_statistics";\n-\n-unless (open(FAMILY_STATISTICS, ">$statistics_outputfile")) {\n-\tprint "Cannot open file \\"$statistics_outputfile\\" to write to!!\\n\\n";\n-\texit;\n-}\n-\n-# Count the family entries\n-my @output_family_statistics = ();\n-my @gefundene_familien = ();\n-my $family_counter = 1;\n-\n-shift @family_list;\n-@family_list = sort @family_list;\n-\n-\n-my $old_family = "";\n-push @family_list, \'BAD FIX\'; # Makes to loop go through every fam and stops at non fam(BAD FIX).\n-\n-foreach my $line (@family_list) {\n-\tif ($line eq $old_family) {\n-\t\t$family_counter++;\n-\t}\n-\telsif ($old_family ne "") {\n-\t\tpush (@output_family_statistics,"$old_family;$family_counter\\n");\n-\t\t# Add all found families to a list of found families\n-\t\tpush (@gefundene_familien,"$old_family");\n-\t        $family_counter=1;\n-\t}\n-\t$old_family = $line;\n-}\n-\n-my %hash = ();\n-\n-# Put all families from the classifictaion ruled and all found families in a hash\n-foreach my $element (@gefundene_familien,@liste_alle_familien) {$hash{$element}++;}\n-\n-# Remove merged families\n-delete $hash{\'GARP_ARR-B_Myb\'};\n-delete $hash{\'GARP_ARR-B_G2\'};\n-delete $hash{\'bZIP1\'};\n-delete $hash{\'bZIP2\'};\n-delete $hash{\'bZIPAUREO\'};\n-delete $hash{\'bZIPCDD\'};\n-delete $hash{\'HRT\'};\n-delete $hash{\'GIY_YIG\'};\n-\n-# Add all not found families from the classification rules to @output_family_statistics \n-# With zero as number of families found \n-\n-foreach my $element (keys %hash) {\n-\tif (($hash{$element} == 1) and ( $element eq "0_no_family_found")) {\n-\tpush (@output_family_statistics,"$element;$unclassified_families\\n");\n-\t}\n-\tif (($hash{$element} == 1) and ($element ne "0_no_family_found")) {\n-\tpush (@output_family_statistics,"$element;0\\n");\n-\t}\n-}\n-\n-# Sort @output_family_statistics caseinsensitive-alphabetically \n-my @sortierte_statistik = sort {lc $a cmp lc $b} @output_family_statistics;\n-\n-# Print headline to @sortierte_statistik\n-unshift (@sortierte_statistik,"family statistics for $hmmsearch_output\\n");\n-\n-print FAMILY_STATISTICS @sortierte_statistik;\n-\n-# Print FAMILY_STATISTICS "$old_family;$family_counter\\n";\n-\n-close (FAMILY_STATISTICS);\n-\n-#################################################\n-### 6. Give out some statistical informations ###\n-#################################################\n-\n-$entry_counter [0] = 0;\n-my $sum = 0;\n-foreach my $entry (@entry_counter) {\n-\t$sum += $entry;\n-}\n-print "$classified_families classifications were found for $sum proteins.\\n";\n-print "This classifications are divided in:\\n";\n-my $count = 0;\n-foreach my $element (@entry_counter) {\n-\tif ($count != 0) {\n-\t\tprint "$element proteins were classified for $count";\n-\t\tif ($count == 1) {print " family\\n";}\n-\t\telse {print " different families\\n";}\n-\t}\n-\t$count++;\n-}\n-print "\\n$unclassified_families proteins could not be classified\\n\\n";\n-\n-print "*** The results were written in $family_classifications and $subfamily_classifications ***\\n";\n-print "*** done ***\\n\\n";\n-\n-exit;\n-\n-sub get_file_data {\n-\t\n-\tmy ($filename) = @_;\n-\n-\tuse strict;\n-\tuse warnings;\n-\n-\tmy @filedata = ();\n-\n-\tunless( open(GET_FILE_DATA, $filename)) {\n-\t\tprint STDERR "Cannot open file \\"$filename\\"n\\n";\n-\t\texit;\n-\t}\n-\n-\t@filedata = <GET_FILE_DATA>;\n-\n-\tclose GET_FILE_DATA;\n-\n-\treturn @filedata;\n-}\n-\n'
b
diff -r 196795831b6a -r c4f865bd101a test-data/output.2.tsv
--- a/test-data/output.2.tsv Wed Feb 14 13:54:16 2024 +0000
+++ b/test-data/output.2.tsv Thu Feb 22 10:07:53 2024 +0000
b
@@ -57,13 +57,14 @@
 HD-NDX 0
 HD-other 0
 HD-SAWADEE 0
-HD_BEL 0
 HD_DDT 0
-HD_KNOX1 0
-HD_KNOX2 0
 HD_PHD 0
 HD_PINTOX 0
 HD_PLINC 0
+HD_TALE 0
+HD_TALE_BEL 0
+HD_TALE_KNOX1 0
+HD_TALE_KNOX2 0
 HD_WOX 0
 HMG 0
 HSF 0
b
diff -r 196795831b6a -r c4f865bd101a test-data/output.domtbl.tsv
--- a/test-data/output.domtbl.tsv Wed Feb 14 13:54:16 2024 +0000
+++ b/test-data/output.domtbl.tsv Thu Feb 22 10:07:53 2024 +0000
[
@@ -20,9 +20,9 @@
 # Program:         hmmsearch
 # Version:         3.3.2 (Nov 2020)
 # Pipeline mode:   SEARCH
-# Query file:      /home/saskia/code/github/galaxyproject/tools-iuc/tools/tapscan/tapscan_domains_v12.txt
-# Target file:     /tmp/saskia/tmp4n93kzh1/files/b/7/b/dataset_b7b39fd6-41f1-440d-bfb7-da7a3a9f070e.dat
-# Option settings: hmmsearch --domtblout domtblout.txt --cut_ga /home/saskia/code/github/galaxyproject/tools-iuc/tools/tapscan/tapscan_domains_v12.txt /tmp/saskia/tmp4n93kzh1/files/b/7/b/dataset_b7b39fd6-41f1-440d-bfb7-da7a3a9f070e.dat 
-# Current dir:     /tmp/saskia/tmp4n93kzh1/job_working_directory/000/4/working
-# Date:            Mon Nov 13 14:57:28 2023
+# Query file:      /home/saskia/code/github/bgruening/galaxytools/tools/tapscan/tapscan_domains_v13.txt.gz
+# Target file:     /tmp/saskia/tmpwta2cyq_/files/f/5/c/dataset_f5c238a3-8cc8-42f9-86b5-ebc2431be570.dat
+# Option settings: hmmsearch --domtblout domtblout.txt --cut_ga /home/saskia/code/github/bgruening/galaxytools/tools/tapscan/tapscan_domains_v13.txt.gz /tmp/saskia/tmpwta2cyq_/files/f/5/c/dataset_f5c238a3-8cc8-42f9-86b5-ebc2431be570.dat 
+# Current dir:     /tmp/saskia/tmpwta2cyq_/job_working_directory/000/4/working
+# Date:            Thu Feb 22 10:10:40 2024
 # [ok]