Mercurial > repos > bgruening > tapscan

--- a/tapscan.xml	Wed Feb 14 13:54:16 2024 +0000
+++ b/tapscan.xml	Thu Feb 22 10:07:53 2024 +0000
@@ -1,4 +1,4 @@
-<tool id="tapscan_classify" name="TAPScan Classify" version="4.74+galaxy0" profile="23.0">
+<tool id="tapscan_classify" name="TAPScan Classify" version="4.76+galaxy0" profile="23.0">
     <description>Detect Transcription Associated Proteins (TAPs)</description>
     <edam_topics>
         <edam_topic>topic_0121</edam_topic>
@@ -9,28 +9,28 @@
         <requirement type="package" version="4.8">sed</requirement>
     </requirements>
     <required_files>
-        <include type="literal" path="tapscan_script_v74.pl"/>
-        <include type="literal" path="tapscan_domains_v12.txt"/>
-        <include type="literal" path="tapscan_rules_v81.txt"/>
-        <include type="literal" path="tapscan_coverage_values_v10.txt"/>
+        <include type="literal" path="tapscan_classify.pl"/>
+        <include type="literal" path="tapscan_domains_v13.txt.gz"/>
+        <include type="literal" path="tapscan_rules_v82.txt"/>
+        <include type="literal" path="tapscan_coverage_values_v11.txt"/>
     </required_files>
     <command detect_errors="aggressive"><![CDATA[

 hmmsearch
   --domtblout domtblout.txt
   --cut_ga
-  '${__tool_directory__}/tapscan_domains_v12.txt.gz'
+  '${__tool_directory__}/tapscan_domains_v13.txt.gz'
   '$protein_fasta_in'

 &&

-perl '${__tool_directory__}/tapscan_script_v74.pl'
+perl '${__tool_directory__}/tapscan_classify.pl'
   domtblout.txt
-  '${__tool_directory__}/tapscan_rules_v81.txt'
+  '${__tool_directory__}/tapscan_rules_v82.txt'
   '$taps_detected'
   '$taps_family_counts'
   '$taps_detected_extra'
-  '${__tool_directory__}/tapscan_coverage_values_v10.txt'
+  '${__tool_directory__}/tapscan_coverage_values_v11.txt'

 &&
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tapscan_classify.pl	Thu Feb 22 10:07:53 2024 +0000
@@ -0,0 +1,932 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+use File::Basename;
+
+my $tapscan_version = "v4.76";
+print "Running TAPscan Classify version $tapscan_version \n\n";
+
+
+# Written by Gerrit Timmerhaus (gerrit.timmerhaus@biologie.uni-freiburg.de).
+# Changes included by Kristian Ullrich, Per Wilhelmsson, Romy Petroll and Saskia Hiltemann.
+
+# Script to extract all detected domains out of a hmmsearch results file and classify the families of all used proteins based on these domains.
+# The classification depends on a table which contains all known classification rules for the protein families of interest and on specific coverage values defined for every domain.
+# The script provides three outputs, namely output.1, output.2 and output.3. The output files are tables in ";"-delimited format.
+# The structure of output.1 is: "sequence ID ; TAP family ; number of classifications ; domains".
+# Output.3 shares in principle the same structure as output.1, except that subfamilies are considered. ("sequence ID ; TAP family ; Subfamily ; number of classifications ; domains")
+# The superior TAP family is specified first, followed by the subfamily. If a TAP family has no subfamily, the TAP family is specified first and then a "-".
+# The structure of output.2 is: "TAP family";"number of detected proteins".
+# More than one entry for a protein is possible because the classification rules may allow more than one classification.
+#
+# The script must be startet with the arguments <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamily classifications file> <"filter" if desired>
+
+if (!@ARGV or ($ARGV [0] eq "-h") or ($ARGV [0] eq "-help")) {
+	print "Usage: extract.and.classify.pl <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamily classifications file> <\"filter\" (if desired)>\n\n";
+	exit;
+}
+
+# hmmsearch_output: domtblout file
+my $hmmsearch_output = $ARGV [0];
+# decision_table: rules file
+my $decision_table = $ARGV [1];
+# family_classifications: output.1
+my $family_classifications = $ARGV [2];
+# family_statistics: output.2
+my $family_statistics = $ARGV [3];
+# subfamily_classifications: output.3
+my $subfamily_classifications = $ARGV [4];
+# domspec_cuts: coverage values file
+my $domspec_cuts = $ARGV [5];
+# gene_model_filte: filter for ARATH and ORYSA
+my $gene_model_filter = $ARGV [6];
+
+# get basename for output files
+my($basename, $dirs, $suffix) = fileparse($hmmsearch_output, qr/\.[^.]*/);
+
+if ($family_statistics eq "") {
+	print "Usage: extract.and.classify.pl <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamil classifications file> <\"filter\" (if desired)>\n\n";
+	exit;
+}
+
+if ($gene_model_filter and $gene_model_filter eq "filter") {
+	print "\nGene model filter is activated. It only works for TAIR (Arabidopsis) and TIGR (Rice) proteins up to now\n";
+}
+
+# Array where the $hmmsearch-output/domtblout file will be stored
+my @output = ();
+# Array with domain-specific coverage values
+my @cuts = ();
+# Array with rules
+my @dec_table = ();
+# Counter for the number of detected domains in the hmmsearch output file
+my $entry_counter = 0;
+# Containes the actual result for a query sequence
+my $akt_entry = "";
+# Used to define query entry to ignore similar domains
+my $whole_entry = "";
+# Includes the final entries after ignoring similar domains
+my @results_of_extraction = ();
+# Used to define query entry to ignore similar domains
+my $extracted_domain = "";
+# Used to define query entry to ignore similar domains
+my $present = "";
+# Used to define query entry to ignore similar domains
+my $protein = "";
+
+my $lek = "";
+
+############################################
+### 1. Read in the hmmsearch output file ###
+############################################
+
+print "\n*** reading in $hmmsearch_output ***\n\n";
+
+@output = get_file_data("$hmmsearch_output");
+
+print "*** Parsing $hmmsearch_output ***\n\n";
+
+# If wrong format exit the program, ninth row from the end
+if ($output [-9] !~ /^# Program:         hmmsearch.*/) {
+	print "wrong file format. $hmmsearch_output has to be a hmmsearch output file\n\n";
+	exit;
+}
+
+### 1.1 Trim input file (hmmsearch) to fit script. Erase lines starting with #.
+@output = grep !/^#.*/, @output;
+
+# Read domain-specific coverage values file
+# domname	cut-off
+# EIN3		0.5874125874
+
+open(FILENAME,$domspec_cuts);
+my %cuts = map { chomp; split /\t/ } <FILENAME>;
+
+### 1.2 Use first (prot name), third (domain name), eight (dom evalue), fourteenth (# overlapping) and fifteenth (# domain) column of data input(array element).
+foreach my $output (@output) {
+	$output =~ /^(\S+)\s+\S+\s+\S+\s+(\S+)\s+\S+\s+(\S+)\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+\S+\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+.*/;
+	$lek = (($6-$5)+1)/$3;
+	$output = $1."\t".$2."\t".$4."\t".$lek;
+	}
+
+# Stucture:
+# ARATHwo_AT2G03430.1	Ank	1.8e-08		0.93
+# ARATHwo_AT2G03430.1	Ank     7.8e-10		0.95
+# ARATHwo_AT2G03430.1	Ank     1.2e-08		0.93
+# ARATHwo_AT2G03430.1	Ank     1.2e-08		0.63
+
+################################
+### 2. Modify the input data ###
+################################
+
+### 2.1 Run through length_cut_off loop (throws out entries below coverage values)
+my @cutarray;
+open(FH, '>', "${basename}_filtered_sequences.txt") or die $!;
+print FH "Below are the sequences that were filtered out, along with the reason";
+foreach my $output (@output) {
+	$output =~ /^(\S+)\t+(\S+)\t(\S+)\t(\S+)/;
+	if ($4 > $cuts{$2}) {
+	push (@cutarray, $output);
+	}
+    else{
+        print FH "$output --> did not meet coverage cutoff of $cuts{$2} (cov was $4)\n"
+    }
+}
+close(FH);
+
+# End up with (prot name) (domain name) (dom evalue) (overlapping)
+
+# Structure:
+# ARATHwo_AT2G03430.1	Ank	1.8e-08		0.93
+# ARATHwo_AT2G03430.1	Ank     7.8e-10		0.95
+# ARATHwo_AT2G03430.1	Ank     1.2e-08		0.93
+
+
+### 2.2 Retain hit with the lowest evalue
+
+# String for the previous protein name
+my $old_prot = "";
+# Coordinates for array filling
+my $x = -1;
+my $y = 1;
+my $scoreeval;
+my @newarray;
+foreach my $cutarray (@cutarray) {
+	$cutarray =~ /^(\S+)\t+(\S+)\t(\S+)\t(\S+)/;
+	$akt_entry = $1.$2;
+	# If the protein contains more than one of the same domain
+	if ($old_prot eq $akt_entry) {
+		if ($3>$scoreeval) {
+			$y++;
+			$newarray[$x] = $1."\t".$2."\t".$3."\t".$y;
+			}
+		else {
+			$y++;
+			$newarray[$x] = $1."\t".$2."\t".$scoreeval."\t".$y;
+			}
+		}
+	# If the protein is new reset $j and push the entry in the next array
+	else {
+		$y = 1;
+		$x++;
+		$newarray[$x] = $1."\t".$2."\t".$3."\t".$y;
+	}
+	$old_prot = $akt_entry;
+	$scoreeval = $3
+
+}
+# Final structure:
+# ARATHwo_AT2G03430.1	Ank     7.8e-10		3
+
+### 2.3 Then sort @output primarily on prot name($1) and then secondarily on eval($3) to fit the "similar"-procedure.
+@newarray = sort { (split '\t', $a)[0] cmp (split '\t', $b)[0] || (split '\t', $a)[2] <=> (split '\t', $b)[2] } @newarray;
+
+# Flags for merged domains
+my $ARR_B_counter = 0;
+my $bZIP_counter = 0;
+my $ET_counter = 0;
+
+# Flags for similar domain omission
+my $WD40domain = 0;		# modified for FIE rule
+my $FIEdomain = 0;		# --------'''''--------
+my $MYBdomain = 0;
+my $G2domain = 0;
+my $YBdomain = 0;
+my $YCdomain = 0;
+my $PHDdomain = 0;
+my $Alfindomain = 0;
+my $Dr1domain = 0;
+my $GATAdomain = 0;
+my $Dofdomain = 0;
+my $C1domain = 0;
+
+foreach my $line (@newarray) {
+
+# Reset the flags for domain omission
+	if ($line !~ /^$protein.*$/ ) {
+		$WD40domain = 0;
+		$FIEdomain = 0;
+		$MYBdomain = 0;
+		$G2domain = 0;
+		$YBdomain = 0;
+		$YCdomain = 0;
+		$PHDdomain = 0;
+		$Alfindomain = 0;
+		$Dr1domain = 0;
+		$GATAdomain = 0;
+		$Dofdomain = 0;
+		$C1domain = 0;
+
+	}
+
+	# Get the Query entry
+	$line =~ /^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/;
+	$protein = "$1";
+	$extracted_domain = $2;
+	$present = "$4"; # FIX! number for many Myb
+	$whole_entry = $1.";".$2.";".$3.";".$4;
+
+
+	# Ignore FIE if it is not better scored than WD40
+	if ($extracted_domain eq "WD40" and $FIEdomain == 0) {$WD40domain = 1;}
+
+	# Ignore similar domains part 1
+
+	if ($extracted_domain eq "Myb_DNA-binding" and $G2domain == 0) {$MYBdomain = 1;}
+	if ($extracted_domain eq "G2-like_Domain" and $MYBdomain == 0) {$G2domain = 1;}
+
+	if ($extracted_domain eq "PHD" and $Alfindomain == 0 and $C1domain == 0) {$PHDdomain = 1;}
+	if ($extracted_domain eq "Alfin-like" and $PHDdomain == 0 and $C1domain == 0) {$Alfindomain = 1;}
+	if ($extracted_domain eq "C1_2" and $Alfindomain == 0 and $PHDdomain == 0) {$C1domain = 1;}
+
+        if ($extracted_domain eq "GATA" and $Dofdomain == 0) {$GATAdomain = 1;}
+	if ($extracted_domain eq "zf-Dof" and $GATAdomain == 0) {$Dofdomain = 1;}
+
+	# Ignore similar domains part 2
+
+	if ($extracted_domain eq "FIE_clipped_for_HMM" and $WD40domain == 1) {next;}
+	if ($extracted_domain eq "Myb_DNA-binding" and $G2domain == 1) {next;}
+	if ($extracted_domain eq "G2-like_Domain" and $MYBdomain == 1) {next;}
+
+	if ($extracted_domain eq "PHD" and ($Alfindomain == 1 or $C1domain == 1)) {next;}
+	if ($extracted_domain eq "Alfin-like" and ($PHDdomain == 1 or $C1domain ==1)) {next;}
+	if ($extracted_domain eq "C1_2" and ($PHDdomain == 1 or $Alfindomain == 1)) {next;}
+
+       	if ($extracted_domain eq "GATA" and $Dofdomain == 1) {next;}
+	if ($extracted_domain eq "zf-Dof" and $GATAdomain == 1) {next;}
+
+	# Save the entry
+	push @results_of_extraction, $whole_entry;
+	$entry_counter++;
+}
+print "$entry_counter domain matches were found in $hmmsearch_output\n\n";
+
+# Sort the entries:
+my @sorted_results_of_extraction = @results_of_extraction;
+print "*** Preparing hmmsearch results for classification ***\n\n";
+
+# This array will be filled with the sorted proteins
+my $array_of_arrays = [];
+
+# String for the previous protein name
+my $old_entry = "";
+
+# Coordinates for array filling
+my $i = -1;
+my $j = 0;
+
+@dec_table = get_file_data("$decision_table");
+
+if ($dec_table [0] !~ /^[^;]+;[^;]+;[^;]/) {
+	print "wrong file format. $decision_table has to be in the format family;domain;type\n\n";
+	exit;
+}
+
+# Formate the entries
+foreach my $domain_entry (@sorted_results_of_extraction) {
+
+	$domain_entry =~ /^([^;]+);/;
+	$akt_entry = $1;
+	# If the protein has more than one domain push it behind the old entry
+	if ($old_entry eq $akt_entry) {
+		$j++;
+		$array_of_arrays->[$i][$j] = $domain_entry;
+	}
+
+	# If the protein is new reset $j and push the entry in the next array
+	else {
+		$j = 0;
+		$i++;
+		$array_of_arrays->[$i][$j] = $domain_entry;
+	}
+	$old_entry = $akt_entry;
+
+}
+$old_entry = "";
+
+# Remember the number of entries to use them later
+my $number_of_proteins = $i+1;
+print "$number_of_proteins proteins were found in $hmmsearch_output\n";
+$number_of_proteins--;
+
+#########################################
+### 3. Get the classification entries ###
+#########################################
+
+# Make a list of all families in the classification rules file
+my @liste_alle_familien = ('0_no_family_found');
+
+my $ARR_B_in_list = 0;
+my $bZIP_in_list = 0;
+my $ET_in_list = 0;
+
+my $array_of_classifications = [];
+$i = -1;
+$j = 0;
+
+my @classifications_input = get_file_data("$decision_table");
+
+# Sort the classifications
+my @classifications = sort @classifications_input;
+
+
+foreach my $classification (@classifications) {
+	$classification =~ /^([^;]+);/;
+	$akt_entry = $1;
+
+	# If the family has more than one domain entry push it behind the old entry
+	if ($old_entry eq $akt_entry) {
+
+		$j++;
+		$array_of_classifications->[$i][$j] = $classification;
+	}
+
+	# If the family is new push it in the next array and reset $j
+	else {
+		$j = 0;
+		$i++;
+		$array_of_classifications->[$i][$j] = $classification;
+		# Add family to list of all families in the classification rules
+		push(@liste_alle_familien,$akt_entry);
+
+		# Add hardcoded "OR" defined families to list of families if subfamily is present
+		# Do this just once for each family
+		if (($akt_entry eq "GARP_ARR-B_Myb") or ($akt_entry eq "GARP_ARR-B_G2")) {
+                	if ($ARR_B_in_list == 0) {
+				push(@liste_alle_familien,"GARP_ARR-B");
+				$ARR_B_in_list++;
+			}}
+		if (($akt_entry eq "bZIP1") or ($akt_entry eq "bZIP2") or ($akt_entry eq "bZIPAUREO") or ($akt_entry eq "bZIPCDD")) {
+                	if ($bZIP_in_list == 0) {
+				push(@liste_alle_familien,"bZIP");
+				$bZIP_in_list++;
+			}}
+		if (($akt_entry eq "HRT") or ($akt_entry eq "GIY_YIG")) {
+                	if ($ET_in_list == 0) {
+				push(@liste_alle_familien,"ET");
+				$ET_in_list++;
+			}}
+	}
+	$old_entry = $akt_entry;
+}
+
+my $number_of_classifications = $i+1;
+print "$number_of_classifications different families were found in $decision_table\n\n";
+$number_of_classifications--;
+
+print "*** begin classification ***\n\n";
+
+# Create the output files (output.1 and output.3)
+my $outputfile = "$family_classifications";
+my $subfamilyoutput = "$subfamily_classifications";
+
+unless (open(FAMILY_CLASSIFICATIONS, ">$outputfile")) {
+	print "Cannot open file \"$outputfile\" to write to!!\n\n";
+	exit;
+}
+
+unless (open(SUBFAMILY_CLASSIFICATIONS, ">$subfamilyoutput")) {
+	print "Cannot open file \"$subfamilyoutput\" to write to!!\n\n";
+	exit;
+}
+
+print FAMILY_CLASSIFICATIONS "family classifications for $hmmsearch_output\n";
+print SUBFAMILY_CLASSIFICATIONS "family classifications for $hmmsearch_output\n";
+
+# Counter for classified families
+my $classified_families = 0;
+# Counter for unclassified families
+my $unclassified_families = 0;
+# Counter for the different amount of entries in classification possibilities
+my @entry_counter = [];
+# Counter for the possible classifications for the current protein
+my $possibilities = 0;
+# For later famliy statistics
+my @family_list = [];
+# This array will be filled with the result. Later written in output.1 and output.3
+my @family_classifications_file = [];
+my @subfamily_classifications_file = [];
+
+# One time for every protein in array of arrays
+for (my $ii = 0; $ii <= $number_of_proteins; $ii++) {
+	my $jj = 0;
+	$possibilities = 0;
+
+    # Flag to mark that any family was found for the current protein
+	my $member_found = 0;
+
+	# In this string all the domains of one protein will be stored
+	# The ; signs are important to mark a single entry (for example to differ AP2 and TF_AP2)
+	my $domains = ";";
+
+	# Get the domains in all elements of the current protein array:
+	while ($array_of_arrays->[$ii][$jj]) {
+
+		$array_of_arrays->[$ii][$jj] =~ /([^;]+);[^;]+;(\d+)$/;
+		# For discrimination between MYB and MYB-related. If more than 1 MYB domains was found replaced "Myb_DNA-binding" with "two_or_more_Myb_DNA-binding"
+		# Same for LIM
+		if ($1 eq "Myb_DNA-binding" and $2 eq 2) {
+			$domains .= "MYB-2R;Myb_DNA-binding;";
+		}
+		elsif ($1 eq "Myb_DNA-binding" and $2 eq 3) {
+			$domains .= "MYB-3R;Myb_DNA-binding;";
+		}
+		elsif ($1 eq "Myb_DNA-binding" and $2 eq 4) {
+			$domains .= "MYB-4R;Myb_DNA-binding;";
+		}
+		elsif ($1 eq "LIM" and $2 > 1) {
+			$domains .= "two_or_more_LIM;LIM;";
+		}
+		else {
+			$domains .= "$1;";
+		}
+		$jj++;
+	}
+
+#########################
+### 4. Classification ###
+#########################
+
+# Get the name of the actual protein for the output after classification:
+$array_of_arrays->[$ii][0] =~ /^([^;]+);/;
+$akt_entry = $1;
+
+# Variables for double classification decisions:
+
+# Here the shoulds for the actual family will be stored
+my $shoulds_act_family = ";";
+# Here the shoulds of the previous CLASSIFIED family will be stored
+my $shoulds_prev_family = "";
+
+# This loop will be made for every family entry in output.1 and output.3
+for ($i = 0;$i <= $number_of_classifications; $i++) {
+	$j = 0;
+
+	# Flag to mark if the protein is a possible member of the current family
+	my $possible_member = 0;
+
+	# Reset the domain entries for the current family
+        $shoulds_act_family = ";";
+
+        # This loop will be made for every domain classification entry for the current family
+	while ($array_of_classifications->[$i][$j]) {
+
+		# For "should" entries
+		if ($array_of_classifications->[$i][$j] =~ /;should$/) {
+
+			# Get the domain from the classification entry
+			$array_of_classifications->[$i][$j] =~ /^[^;]+;([^;]+);[^;]+/;
+
+			my $domain_classification_entry = $1;
+			# Check if the protein might be a member of the current family. If yes check the next entry
+			if ($domains =~ /;$domain_classification_entry;/) {
+				$possible_member = 1;
+				$j++;
+				# All matched domains for the current family are stored here:
+				$shoulds_act_family .= "$domain_classification_entry;";
+				next;
+			}
+			$possible_member = 0;
+			$j++;
+			last;
+		}
+
+		# For "should not" entries
+		if ($array_of_classifications->[$i][$j] =~ /;should not$/) {
+
+			# Get the domain from the classification entry
+			$array_of_classifications->[$i][$j] =~ /^[^;]+;([^;]+);[^;]+/;
+
+			# Check if the protein has the domain. If yes go to the next family
+			my $domain_check = $1;
+			if ($domains =~ /;$domain_check;/) {
+					$possible_member = 0;
+					last;
+			}
+			# If not check the next entry
+			else {
+					$j++;
+			    		next;
+			}
+		}
+
+			# This happens when there is no "should" or "should not" entry at the right position
+			print "error in classification entry $array_of_classifications->[$i][$j]\n"
+	}
+
+	# If the check was successful print the found classification
+	if ($possible_member) {
+
+		$array_of_classifications->[$i][0] =~ /^([^;]+);/;
+		my $currentFamily = $1;
+		$possibilities++;
+		# The double classified proteins solution:
+		if ($possibilities > 1) {
+
+			# Fill an array with the current domains in $domains
+			my @domains_array = [];
+			my $number_of_domains = ($domains =~ tr/;/;/)-1;
+			my $tempdomains = $domains;
+			for (my $domain_counter = 0;$domain_counter != $number_of_domains;$domain_counter++) {
+
+				$tempdomains =~ /^;([^;]+);/;
+				$domains_array[$domain_counter] = $1;
+				# Remove the inserted domain from domains
+				$tempdomains =~ s/^;[^;]+;/;/;
+			}
+			# Compare the two possible families (actual and previous family)
+			my $array_counter = 0;
+			while ($array_counter < $number_of_domains) {
+				if ($shoulds_prev_family =~ /$domains_array[$array_counter]/) {
+					if ($shoulds_act_family =~ /$domains_array[$array_counter]/) {
+
+						# If actual domain is in previous and actual family go to next domain
+						$array_counter++;
+						next;
+
+					}
+					# Protein was right classified in the previous classification. The actual entry will be ignored
+
+					$possibilities--;
+					last;
+				}
+
+				# If actual domain is in the actual family but not in the previous. The actual classification is right
+				if ($shoulds_act_family =~ /$domains_array[$array_counter]/) {
+
+					pop @family_classifications_file;
+					pop @subfamily_classifications_file;
+
+					# Merge "GARBP_ARR-B_Myb" and "GARP_ARR-B_G2" to "GARP_ARR-B"
+					if ( ($currentFamily eq "GARP_ARR-B_Myb") or ($currentFamily eq "GARP_ARR-B_G2")){
+
+						push @family_classifications_file, "$akt_entry;GARP_ARR-B;$possibilities$domains\n";
+						push @subfamily_classifications_file, "$akt_entry;GARP_ARR-B;-;$possibilities$domains\n";
+						if ($ARR_B_counter == 0) {
+							push @liste_alle_familien, 'GARP_ARR-B';
+							$ARR_B_counter++;
+						}
+					}
+					elsif ( ($currentFamily eq "bZIP1") or ($currentFamily eq "bZIP2") or ($currentFamily eq "bZIPAUREO") or ($currentFamily eq "bZIPCDD")) {
+
+						push @family_classifications_file, "$akt_entry;bZIP;$possibilities$domains\n";
+						push @subfamily_classifications_file, "$akt_entry;bZIP;-;$possibilities$domains\n";
+						if ($bZIP_counter == 0) {
+							push @liste_alle_familien, 'bZIP';
+							$bZIP_counter++;
+						}
+					}
+					elsif ( ($currentFamily eq "HRT") or ($currentFamily eq "GIY_YIG")) {
+						push @subfamily_classifications_file, "$akt_entry;ET;-;$possibilities$domains\n";
+						push @family_classifications_file, "$akt_entry;ET;$possibilities$domains\n";
+						if ($ET_counter == 0) {
+							push @liste_alle_familien, 'ET';
+							$ET_counter++;
+						}
+					}
+					else {
+
+						push @family_classifications_file, "$akt_entry;$currentFamily;$possibilities$domains\n";
+						push @subfamily_classifications_file, "$akt_entry;$currentFamily;-;$possibilities$domains\n";
+					}
+					$possibilities--;
+					last;
+
+				}
+
+					$array_counter++;
+			}
+
+		}
+		# Normal enty: if the family is classified for one family write it in the array
+		else {
+
+			# Store the matched domains of the actual family for respectively later doubly classification decision
+			$shoulds_prev_family = $shoulds_act_family;
+			# Store the entry in the output array
+			# Merge "GARBP_ARR-B_Myb" and "GARP_ARR-B_G2" to "GARP_ARR-B"
+			# Add GARP_ARR-B to @liste_alle_familien ONCE
+
+				if ( ($currentFamily eq "GARP_ARR-B_Myb") or ($currentFamily eq "GARP_ARR-B_G2")){
+					push @family_classifications_file, "$akt_entry;GARP_ARR-B;$possibilities$domains\n";
+					push @subfamily_classifications_file, "$akt_entry;GARP_ARR-B;-;$possibilities$domains\n";
+					if ($ARR_B_counter == 0) {
+						push @liste_alle_familien, 'GARP_ARR-B';
+						$ARR_B_counter++;
+					}
+				}
+				elsif (($currentFamily eq "bZIP1") or ($currentFamily eq "bZIP2") or ($currentFamily eq "bZIPAUREO") or ($currentFamily eq "bZIPCDD")){
+					push @family_classifications_file, "$akt_entry;bZIP;$possibilities$domains\n";
+					push @subfamily_classifications_file, "$akt_entry;bZIP;-;$possibilities$domains\n";
+					if ($bZIP_counter == 0) {
+						push @liste_alle_familien, 'bZIP';
+						$bZIP_counter++;
+					}
+				}
+				elsif (($currentFamily eq "HRT") or ($currentFamily eq "GIY_YIG")){
+					push @family_classifications_file, "$akt_entry;ET;$possibilities$domains\n";
+					push @subfamily_classifications_file, "$akt_entry;ET;-;$possibilities$domains\n";
+					if ($ET_counter == 0) {
+						push @liste_alle_familien, 'ET';
+						$ET_counter++;
+					}
+				}
+				else {
+
+				push @family_classifications_file, "$akt_entry;$currentFamily;$possibilities$domains\n";
+				push @subfamily_classifications_file, "$akt_entry;$currentFamily;-;$possibilities$domains\n";
+				}
+
+                	$classified_families++;
+			}
+
+		# Flag to mark that any family was found for the current protein
+		$member_found = 1;
+		}
+
+	}
+
+	if ($possibilities == 0) {
+
+		# If no family was found for the current protein give this out
+		push @family_classifications_file, "$akt_entry;0_no_family_found;0$domains\n";
+		push @subfamily_classifications_file, "$akt_entry;0_no_family_found;-;0$domains\n";
+		$unclassified_families++;
+	}
+
+	# Increment the number of possible members in the array at the right position
+	else {
+
+		my $temp = $entry_counter[$possibilities];
+		$temp++;
+		$entry_counter[$possibilities] = $temp;
+	}
+
+}
+# Define for which TAP families also subfamilies may be present:
+
+foreach my $subfamily_classifications_file (@subfamily_classifications_file) {
+
+	# If "C2H2" was assigned to a sequence, write "C2H2;C2H2" to output.3 as TAP family and subfamily, respectively.
+	if ($subfamily_classifications_file =~ /C2H2/ ) {
+		$subfamily_classifications_file =~ s/C2H2;-/C2H2;C2H2/
+	}
+	# If "C2H2_IDD" was assigned to a sequence, write "C2H2;C2H2_IDD" to output.3 as TAP family and subfamily, respectively.
+	if ($subfamily_classifications_file =~ /C2H2_IDD/ ) {
+		$subfamily_classifications_file =~ s/C2H2_IDD;-/C2H2;C2H2_IDD/
+	}
+	if ($subfamily_classifications_file =~ /bHLH/ ) {
+		$subfamily_classifications_file =~ s/bHLH;-/bHLH;bHLH/
+	}
+	if ($subfamily_classifications_file =~ /bHLH_TCP/ ) {
+		$subfamily_classifications_file =~ s/bHLH_TCP;-/bHLH;bHLH_TCP/
+	}
+	if ($subfamily_classifications_file =~ /NF-YA/ ) {
+		$subfamily_classifications_file =~ s/NF-YA;-/NFY;NF-YA/
+	}
+	if ($subfamily_classifications_file =~ /NF-YB/ ) {
+		$subfamily_classifications_file =~ s/NF-YB;-/NFY;NF-YB/
+	}
+	if ($subfamily_classifications_file =~ /NF-YC/ ) {
+		$subfamily_classifications_file =~ s/NF-YC;-/NFY;NF-YC/
+	}
+	if ($subfamily_classifications_file =~ /C1HDZ/ ) {
+		$subfamily_classifications_file =~ s/C1HDZ;-/HDZ;C1HDZ/
+	}
+	if ($subfamily_classifications_file =~ /C2HDZ/ ) {
+		$subfamily_classifications_file =~ s/C2HDZ;-/HDZ;C2HDZ/
+	}
+	if ($subfamily_classifications_file =~ /C3HDZ/ ) {
+		$subfamily_classifications_file =~ s/C3HDZ;-/HDZ;C3HDZ/
+	}
+	if ($subfamily_classifications_file =~ /C4HDZ/ ) {
+		$subfamily_classifications_file =~ s/C4HDZ;-/HDZ;C4HDZ/
+	}
+	if ($subfamily_classifications_file =~ /MYST/ ) {
+		$subfamily_classifications_file =~ s/MYST;-/HAT;MYST/
+	}
+	if ($subfamily_classifications_file =~ /CBP/ ) {
+		$subfamily_classifications_file =~ s/CBP;-/HAT;CBP/
+	}
+	if ($subfamily_classifications_file =~ /TAFII250/ ) {
+		$subfamily_classifications_file =~ s/TAFII250;-/HAT;TAFII250/
+	}
+	if ($subfamily_classifications_file =~ /GNAT/ ) {
+		$subfamily_classifications_file =~ s/GNAT;-/HAT;GNAT/
+	}
+	if ($subfamily_classifications_file =~ /LOB1/ ) {
+		$subfamily_classifications_file =~ s/LOB1;-/LBD;LOB1/
+	}
+	if ($subfamily_classifications_file =~ /LOB2/ ) {
+		$subfamily_classifications_file =~ s/LOB2;-/LBD;LOB2/
+	}
+	if ($subfamily_classifications_file =~ /MYB-related/ ) {
+		$subfamily_classifications_file =~ s/MYB-related;-/MYB;MYB-related/
+	}
+	if ($subfamily_classifications_file =~ /SWI\/SNF_SWI3/ ) {
+		$subfamily_classifications_file =~ s/SWI\/SNF_SWI3;-/MYB-related;SWI\/SNF_SWI3/
+	}
+	if ($subfamily_classifications_file =~ /MYB-2R/ ) {
+		$subfamily_classifications_file =~ s/MYB-2R;-/MYB;MYB-2R/
+	}
+	if ($subfamily_classifications_file =~ /MYB-3R/ ) {
+		$subfamily_classifications_file =~ s/MYB-3R;-/MYB;MYB-3R/
+	}
+	if ($subfamily_classifications_file =~ /MYB-4R/ ) {
+		$subfamily_classifications_file =~ s/MYB-4R;-/MYB;MYB-4R/
+	}
+	if ($subfamily_classifications_file =~ /RKD/ ) {
+		$subfamily_classifications_file =~ s/RKD;-/RWP-RK;RKD/
+	}
+	if ($subfamily_classifications_file =~ /NLP/ ) {
+		$subfamily_classifications_file =~ s/NLP;-/RWP-RK;NLP/
+	}
+	if ($subfamily_classifications_file =~ /AP2/ ) {
+		$subfamily_classifications_file =~ s/AP2;-/AP2;AP2/
+	}
+	if ($subfamily_classifications_file =~ /CRF/ ) {
+		$subfamily_classifications_file =~ s/CRF;-/AP2;CRF/
+	}
+}
+
+### 4.1 Filter for gene models
+
+# Filter for gene models. All alternative splice variants will be removed from Arabidopsis and rice entries
+shift @family_classifications_file;
+shift @subfamily_classifications_file;
+# Sort the array for filter and a sorted output
+@family_classifications_file = sort @family_classifications_file;
+@subfamily_classifications_file = sort @subfamily_classifications_file;
+if ($gene_model_filter and $gene_model_filter eq "filter") {
+	print "*** filtering gene models: removing splice variants ***\n";
+
+	# temporary array for the filtered entries
+	my @filtered_entries = [];
+	my $models_removed_counter = 0;
+
+	my $prev_entry = "";
+	foreach my $entry_line (@family_classifications_file) {
+		if ($entry_line !~ /^([^.]+).\d+/) {
+			print "Bad format for filtering splice variants. Use \"filter\" only for TAIR (Aradopsis) or TIGR (Rice) files.\n\n";
+			exit;
+		}
+		if ($1 eq $prev_entry) {
+			$models_removed_counter++;
+			next;
+		}
+		$prev_entry = $1;
+		push @filtered_entries, $entry_line;
+	}
+	print " -> $models_removed_counter gene models were removed\n\n";
+	@family_classifications_file = @filtered_entries;
+	shift @family_classifications_file;
+}
+
+# Print the results in the array to output.1 and output.3 and push the names of the TAP families and Subfamilies into $family_list to create output.2
+foreach my $fcf_line (@family_classifications_file) {
+	$fcf_line =~ /^[^;]+;([^;]+)/;
+	#push @family_list, "$1";
+	print FAMILY_CLASSIFICATIONS "$fcf_line";
+}
+close (FAMILY_CLASSIFICATIONS);
+
+foreach my $fcf_line (@subfamily_classifications_file) {
+	$fcf_line =~ /^[^;]+;([^;]+);([^;]+)/;
+	if ($2 eq "-") {
+		push @family_list, "$1";
+	}
+	else {
+		push @family_list, "$2";
+	}
+	print SUBFAMILY_CLASSIFICATIONS "$fcf_line";
+}
+
+close (SUBFAMILY_CLASSIFICATIONS);
+
+print "*** calculating the family statistics and write it in $family_statistics ***\n\n";
+
+##################################
+### 5. Create the output files ###
+##################################
+
+my $statistics_outputfile = "$family_statistics";
+
+unless (open(FAMILY_STATISTICS, ">$statistics_outputfile")) {
+	print "Cannot open file \"$statistics_outputfile\" to write to!!\n\n";
+	exit;
+}
+
+# Count the family entries
+my @output_family_statistics = ();
+my @gefundene_familien = ();
+my $family_counter = 1;
+
+shift @family_list;
+@family_list = sort @family_list;
+
+
+my $old_family = "";
+push @family_list, 'BAD FIX'; # Makes to loop go through every fam and stops at non fam(BAD FIX).
+
+foreach my $line (@family_list) {
+	if ($line eq $old_family) {
+		$family_counter++;
+	}
+	elsif ($old_family ne "") {
+		push (@output_family_statistics,"$old_family;$family_counter\n");
+		# Add all found families to a list of found families
+		push (@gefundene_familien,"$old_family");
+	        $family_counter=1;
+	}
+	$old_family = $line;
+}
+
+my %hash = ();
+
+# Put all families from the classifictaion ruled and all found families in a hash
+foreach my $element (@gefundene_familien,@liste_alle_familien) {$hash{$element}++;}
+
+# Remove merged families
+delete $hash{'GARP_ARR-B_Myb'};
+delete $hash{'GARP_ARR-B_G2'};
+delete $hash{'bZIP1'};
+delete $hash{'bZIP2'};
+delete $hash{'bZIPAUREO'};
+delete $hash{'bZIPCDD'};
+delete $hash{'HRT'};
+delete $hash{'GIY_YIG'};
+
+# Add all not found families from the classification rules to @output_family_statistics
+# With zero as number of families found
+
+foreach my $element (keys %hash) {
+	if (($hash{$element} == 1) and ( $element eq "0_no_family_found")) {
+	push (@output_family_statistics,"$element;$unclassified_families\n");
+	}
+	if (($hash{$element} == 1) and ($element ne "0_no_family_found")) {
+	push (@output_family_statistics,"$element;0\n");
+	}
+}
+
+# Sort @output_family_statistics caseinsensitive-alphabetically
+my @sortierte_statistik = sort {lc $a cmp lc $b} @output_family_statistics;
+
+# Print headline to @sortierte_statistik
+unshift (@sortierte_statistik,"family statistics for $hmmsearch_output\n");
+
+print FAMILY_STATISTICS @sortierte_statistik;
+
+# Print FAMILY_STATISTICS "$old_family;$family_counter\n";
+
+close (FAMILY_STATISTICS);
+
+#################################################
+### 6. Give out some statistical informations ###
+#################################################
+
+$entry_counter [0] = 0;
+my $sum = 0;
+foreach my $entry (@entry_counter) {
+	$sum += $entry;
+}
+print "$classified_families classifications were found for $sum proteins.\n";
+print "This classifications are divided in:\n";
+my $count = 0;
+foreach my $element (@entry_counter) {
+	if ($count != 0) {
+		print "$element proteins were classified for $count";
+		if ($count == 1) {print " family\n";}
+		else {print " different families\n";}
+	}
+	$count++;
+}
+print "\n$unclassified_families proteins could not be classified\n\n";
+
+print "*** The results were written in $family_classifications and $subfamily_classifications ***\n";
+print "*** done ***\n\n";
+
+
+exit;
+
+sub get_file_data {
+
+	my ($filename) = @_;
+
+	use strict;
+	use warnings;
+
+	my @filedata = ();
+
+	unless( open(GET_FILE_DATA, $filename)) {
+		print STDERR "Cannot open file \"$filename\"n\n";
+		exit;
+	}
+
+	@filedata = <GET_FILE_DATA>;
+
+	close GET_FILE_DATA;
+
+	return @filedata;
+}
+
+
--- a/tapscan_coverage_values_v10.txt	Wed Feb 14 13:54:16 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,154 +0,0 @@
-Acetyltransf_1	0.0931034483
-AP2	0.3214285714
-ARID	0.0516393443
-AUX_IAA	0.1077981651
-Auxin_resp	0.4308510638
-B3	0.196875
-BES1_N	0.3888888889
-BSD	0.485915493
-BTB	0.3105095541
-bZIP_1	0.64453125
-bZIP_2	0.5948275862
-C1_2	0.1612903226
-CAF1C_H4-bd	0.1764705882
-CBFB_NFYA	0.2019230769
-CCT	0.4952830189
-CG-1	0.5568181818
-CSD	0.5878378378
-DDT	0.2678571429
-DEAD	0.1115591398
-dsrm	0.2330097087
-DUF260	0.3638059701
-DUF296	0.155075188
-DUF547	0.1181672026
-DUF573	0.3515625
-DUF632	0.3302603037
-DUF702	0.2314126394
-E2F_TDP	0.2078313253
-EIN3	0.5874125874
-FHA	0.1777456647
-FLO_LFY	0.4569138277
-FYRC	0.3928571429
-FYRN	0.46875
-GAGA_bind	0.2265774379
-GATA	0.5406976744
-GRAS	0.3990825688
-Helicase_C	0.118852459
-HLH	0.3191489362
-HMG_box	0.6866197183
-Homeobox	0.5485074627
-HSF_DNA-bind	0.071641791
-IQ	0.6428571429
-JmjC	0.5058139535
-JmjN	0.4261363636
-K-box	0.438
-KNOX1	0.578125
-KNOX2	0.6346153846
-LIM	0.4726027397
-MBF1	0.3049450549
-Med26	0.1939655172
-Med31	0.451048951
-Med6	0.1577868852
-Med7	0.1818181818
-MEKHLA	0.4357541899
-mTERF	0.4850543478
-Myb_DNA-binding	0.3636363636
-NAM	0.0142857143
-O-FucT	0.0914866582
-Ovate	0.3137755102
-PAH	0.1194267516
-PAZ	0.1564569536
-PC4	0.3848684211
-PHD	0.37
-Piwi	0.4082278481
-PLATZ	0.30625
-PP2C	0.4110962567
-QLQ	0.61875
-RB_B	0.1663987138
-Rcd1	0.4131355932
-Response_reg	0.5017241379
-RFX_DNA_binding	0.5257731959
-RHD_DNA_bind	0.4581447964
-Ribonuclease_3	0.0935013263
-RRN3	0.2342427093
-Runt	0.5714285714
-RWP-RK	0.4542253521
-S1FA	0.7386363636
-SBP	0.4274193548
-SET	0.0392857143
-SH2	0.4197247706
-Sigma70_r2	0.1675531915
-Sigma70_r3	0.6346153846
-Sigma70_r4	0.6388888889
-SIR2	0.45703125
-SNF2_N	0.1150895141
-SRF-TF	0.406779661
-SSXT	0.5856164384
-START	0.5067567568
-STAT_bind	0.3781869688
-SWIB	0.3267326733
-SWIRM	0.2532467532
-TANGO2	0.125
-TCP	0.219665272
-TCR	0.4943181818
-TEA	0.4078282828
-TF_AP-2	0.5660377358
-Tfb2	0.1757668712
-tify	0.5955882353
-Tub	0.0872576177
-VEFS-Box	0.4918831169
-WD40	0.0563909774
-WHIM1	0.5263157895
-Whirly	0.5648148148
-WRC	0.6223404255
-WRKY	0.2196969697
-WSD	0.1768867925
-YABBY	0.2804621849
-zf-AN1	0.4027777778
-zf-B_box	0.2746478873
-zf-C2H2	0.5080645161
-zf-C5HC2	0.253125
-zf-CCCH	0.5689655172
-zf-Dof	0.5955882353
-ZF-HD_dimer	0.5294117647
-zf-MIZ	0.6805555556
-zf-TAZ	0.1566455696
-zf-ZPR1	0.1608910891
-Zn_clus	0.5480769231
-Alfin-like	0.75
-BEL	0.75
-DNC	0.75
-FIE_clipped_for_HMM	0.75
-G2-like_Domain	0.75
-HRT	0.75
-KNOXC	0.75
-LUFS_Domain	0.75
-NF-YB	0.75
-NF-YC	0.75
-NOZZLE	0.75
-PINTOX	0.75
-STER_AP	0.75
-trihelix	0.75
-ULT_Domain	0.75
-VARL	0.75
-VOZ_Domain	0.75
-WOX_HD	0.75
-CXC	0.75
-bZIP_AUREO	0.75
-bZIP_CDD	0.50
-ALOG	0.75
-C2H2-IDD	0.75
-zf-MYST	0.75
-CBP	0.75
-DUF3591	0.75
-LOB2	0.75
-zz-ADA2	0.75
-NLP	0.75
-CRF	0.75
-GIY_YIG	0.75
-ZPR	0.75
-LD	0.75
-NDX	0.75
-SAWADEE	0.75
-C1HDZ	0.75
-C2HDZ	0.75
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tapscan_coverage_values_v11.txt	Thu Feb 22 10:07:53 2024 +0000
@@ -0,0 +1,155 @@
+Acetyltransf_1	0.0931034483
+AP2	0.3214285714
+ARID	0.0516393443
+AUX_IAA	0.1077981651
+Auxin_resp	0.4308510638
+B3	0.196875
+BES1_N	0.3888888889
+BSD	0.485915493
+BTB	0.3105095541
+bZIP_1	0.64453125
+bZIP_2	0.5948275862
+C1_2	0.1612903226
+CAF1C_H4-bd	0.1764705882
+CBFB_NFYA	0.2019230769
+CCT	0.4952830189
+CG-1	0.5568181818
+CSD	0.5878378378
+DDT	0.2678571429
+DEAD	0.1115591398
+dsrm	0.2330097087
+DUF260	0.3638059701
+DUF296	0.155075188
+DUF547	0.1181672026
+DUF573	0.3515625
+DUF632	0.3302603037
+DUF702	0.2314126394
+E2F_TDP	0.2078313253
+EIN3	0.5874125874
+FHA	0.1777456647
+FLO_LFY	0.4569138277
+FYRC	0.3928571429
+FYRN	0.46875
+GAGA_bind	0.2265774379
+GATA	0.5406976744
+GRAS	0.3990825688
+Helicase_C	0.118852459
+HLH	0.3191489362
+HMG_box	0.6866197183
+Homeobox	0.5485074627
+HSF_DNA-bind	0.071641791
+IQ	0.6428571429
+JmjC	0.5058139535
+JmjN	0.4261363636
+K-box	0.438
+KNOX1	0.578125
+KNOX2	0.6346153846
+LIM	0.4726027397
+MBF1	0.3049450549
+Med26	0.1939655172
+Med31	0.451048951
+Med6	0.1577868852
+Med7	0.1818181818
+MEKHLA	0.4357541899
+mTERF	0.4850543478
+Myb_DNA-binding	0.3636363636
+NAM	0.0142857143
+O-FucT	0.0914866582
+Ovate	0.3137755102
+PAH	0.1194267516
+PAZ	0.1564569536
+PC4	0.3848684211
+PHD	0.37
+Piwi	0.4082278481
+PLATZ	0.30625
+PP2C	0.4110962567
+QLQ	0.61875
+RB_B	0.1663987138
+Rcd1	0.4131355932
+Response_reg	0.5017241379
+RFX_DNA_binding	0.5257731959
+RHD_DNA_bind	0.4581447964
+Ribonuclease_3	0.0935013263
+RRN3	0.2342427093
+Runt	0.5714285714
+RWP-RK	0.4542253521
+S1FA	0.7386363636
+SBP	0.4274193548
+SET	0.0392857143
+SH2	0.4197247706
+Sigma70_r2	0.1675531915
+Sigma70_r3	0.6346153846
+Sigma70_r4	0.6388888889
+SIR2	0.45703125
+SNF2_N	0.1150895141
+SRF-TF	0.406779661
+SSXT	0.5856164384
+START	0.5067567568
+STAT_bind	0.3781869688
+SWIB	0.3267326733
+SWIRM	0.2532467532
+TANGO2	0.125
+TCP	0.219665272
+TCR	0.4943181818
+TEA	0.4078282828
+TF_AP-2	0.5660377358
+Tfb2	0.1757668712
+tify	0.5955882353
+Tub	0.0872576177
+VEFS-Box	0.4918831169
+WD40	0.0563909774
+WHIM1	0.5263157895
+Whirly	0.5648148148
+WRC	0.6223404255
+WRKY	0.2196969697
+WSD	0.1768867925
+YABBY	0.2804621849
+zf-AN1	0.4027777778
+zf-B_box	0.2746478873
+zf-C2H2	0.5080645161
+zf-C5HC2	0.253125
+zf-CCCH	0.5689655172
+zf-Dof	0.5955882353
+ZF-HD_dimer	0.5294117647
+zf-MIZ	0.6805555556
+zf-TAZ	0.1566455696
+zf-ZPR1	0.1608910891
+Zn_clus	0.5480769231
+Alfin-like	0.75
+BEL	0.75
+DNC	0.75
+FIE_clipped_for_HMM	0.75
+G2-like_Domain	0.75
+HRT	0.75
+KNOXC	0.75
+LUFS_Domain	0.75
+NF-YB	0.75
+NF-YC	0.75
+NOZZLE	0.75
+PINTOX	0.75
+STER_AP	0.75
+trihelix	0.75
+ULT_Domain	0.75
+VARL	0.75
+VOZ_Domain	0.75
+WOX_HD	0.75
+CXC	0.75
+bZIP_AUREO	0.75
+bZIP_CDD	0.50
+ALOG	0.75
+C2H2-IDD	0.75
+zf-MYST	0.75
+CBP	0.75
+DUF3591	0.75
+LOB2	0.75
+zz-ADA2	0.75
+NLP	0.75
+CRF	0.75
+GIY_YIG	0.75
+ZPR	0.75
+LD	0.75
+NDX	0.75
+SAWADEE	0.75
+C1HDZ	0.75
+C2HDZ	0.75
+Homeobox_KN	0.75
Binary file tapscan_domains_v12.txt.gz has changed
Binary file tapscan_domains_v13.txt.gz has changed
--- a/tapscan_rules_v81.txt	Wed Feb 14 13:54:16 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,298 +0,0 @@
-ABI3/VP1;AP2;should not
-ABI3/VP1;Auxin_resp;should not
-ABI3/VP1;B3;should
-ABI3/VP1;WRKY;should not
-Alfin-like;Alfin-like;should
-Alfin-like;Homeobox;should not
-Alfin-like;zf-TAZ;should not
-Alfin-like;PHD;should not
-AP2;AP2;should
-AP2;CRF;should not
-ARF;Auxin_resp;should
-Argonaute;Piwi;should
-Argonaute;PAZ;should
-ARID;ARID;should
-Aux/IAA;AUX_IAA;should
-Aux/IAA;Auxin_resp;should not
-Aux/IAA;B3;should not
-BBR/BPC;GAGA_bind;should
-BES1;BES1_N;should
-bHLH;HLH;should
-bHLH;TCP;should not
-bHLH_TCP;TCP;should
-bHSH;TF_AP-2;should
-BSD domain containing;BSD;should
-bZIP1;bZIP_1;should
-bZIP1;HLH;should not
-bZIP1;Homeobox;should not
-bZIP2;bZIP_2;should
-bZIP2;HLH;should not
-bZIP2;Homeobox;should not
-bZIPAUREO;bZIP_AUREO;should
-bZIPAUREO;HLH;should not
-bZIPAUREO;Homeobox;should not
-bZIPCDD;bZIP_CDD;should
-bZIPCDD;HLH;should not
-bZIPCDD;Homeobox;should not
-C2C2_CO-like;CCT;should
-C2C2_CO-like;GATA;should not
-C2C2_CO-like;tify;should not
-C2C2_CO-like;PLATZ;should not
-C2C2_CO-like;zf-B_box;should
-C2C2_Dof;zf-Dof;should
-C2C2_Dof;GATA;should not
-C2C2_GATA;GATA;should
-C2C2_GATA;tify;should not
-C2C2_GATA;zf-Dof;should not
-C2C2_YABBY;YABBY;should
-C2H2;zf-C2H2;should
-C2H2;zf-MIZ;should not
-C3H;AP2;should not
-C3H;SRF-TF;should not
-C3H;MYB-2R;should not
-C3H;MYB-3R;should not
-C3H;MYB-4R;should not
-C3H;zf-C2H2;should not
-C3H;zf-CCCH;should
-CAMTA;CG-1;should
-CAMTA;IQ;should
-NF-YA;bZIP_1;should not
-NF-YA;bZIP_2;should not
-NF-YA;CBFB_NFYA;should
-NF-YB;NF-YB;should
-NF-YB;NF-YC;should not
-NF-YC;NF-YB;should not
-NF-YC;NF-YC;should
-NF-YC;HMG_box;should not
-Coactivator p15;PC4;should
-CPP;TCR;should
-CSD;CSD;should
-CudA;STAT_bind;should
-CudA;SH2;should
-DBP;DNC;should
-DBP;PP2C;should
-DDT;DDT;should
-DDT;Homeobox;should not
-DDT;Alfin-like;should not
-Dicer;Piwi;should not
-Dicer;DEAD;should
-Dicer;Helicase_C;should
-Dicer;Ribonuclease_3;should
-Dicer;dsrm;should
-DUF246 domain containing/O-FucT;O-FucT;should
-DUF296 domain containing;DUF296;should
-DUF547 domain containing;DUF547;should
-DUF632 domain containing;DUF632;should
-DUF833 domain containing/TANGO2;TANGO2;should
-E2F/DP;E2F_TDP;should
-EIL;EIN3;should
-FHA;FHA;should
-GARP_ARR-B_G2;CCT;should not
-GARP_ARR-B_G2;G2-like_Domain;should
-GARP_ARR-B_G2;Response_reg;should
-GARP_ARR-B_Myb;CCT;should not
-GARP_ARR-B_Myb;Response_reg;should
-GARP_ARR-B_Myb;Myb_DNA-binding;should
-GARP_G2-like;G2-like_Domain;should
-GARP_G2-like;Response_reg;should not
-GARP_G2-like;Myb_DNA-binding;should not
-GeBP;DUF573;should
-GIF;SSXT;should
-GNAT;Acetyltransf_1;should
-GNAT;PHD;should not
-GRAS;GRAS;should
-GRF;QLQ;should
-GRF;WRC;should
-C3HDZ;Homeobox;should
-C3HDZ;START;should
-C3HDZ;MEKHLA;should
-C4HDZ;Homeobox;should
-C4HDZ;START;should
-C4HDZ;MEKHLA;should not
-HD_PLINC;ZF-HD_dimer;should
-HD_WOX;WOX_HD;should
-HD_DDT;Homeobox;should
-HD_DDT;DDT;should
-HD_DDT;WHIM1;should
-HD_DDT;WSD;should
-HD_PHD;PHD;should
-HD_PHD;Homeobox;should
-HD_PINTOX;Homeobox;should
-HD_PINTOX;PINTOX;should
-HD_BEL;Homeobox;should
-HD_BEL;BEL;should
-HD_KNOX1;Homeobox;should
-HD_KNOX1;KNOX1;should
-HD_KNOX1;KNOX2;should
-HD_KNOX1;KNOXC;should not
-HD_KNOX2;Homeobox;should
-HD_KNOX2;KNOX1;should
-HD_KNOX2;KNOX2;should
-HD_KNOX2;KNOXC;should
-HD-other;EIN3;should not
-HD-other;Homeobox;should
-HD-other;bZIP_1;should not
-HD-other;WOX_HD;should not
-HD-other;PINTOX;should not
-HD-other;PHD;should not
-HD-other;BEL;should not
-HMG;ARID;should not
-HMG;HMG_box;should
-HMG;YABBY;should not
-HRT;HRT;should
-HSF;HSF_DNA-bind;should
-IWS1;Med26;should
-Jumonji_PKDM7;JmjC;should
-Jumonji_PKDM7;JmjN;should
-Jumonji_PKDM7;zf-C5HC2;should
-Jumonji_PKDM7;FYRN;should
-Jumonji_PKDM7;FYRC;should
-Jumonji_Other;JmjC;should
-LFY;FLO_LFY;should
-LIM;two_or_more_LIM;should
-LUG;LUFS_Domain;should
-MADS;SRF-TF;should
-MADS;K-box;should not
-MADS_MIKC;SRF-TF;should
-MADS_MIKC;K-box;should
-MBF1;MBF1;should
-Med6;Med6;should
-Med7;Med7;should
-mTERF;mTERF;should
-MYB-2R;G2-like_Domain;should not
-MYB-2R;Response_reg;should not
-MYB-2R;trihelix;should not
-MYB-2R;MYB-2R;should
-MYB-3R;G2-like_Domain;should not
-MYB-3R;Response_reg;should not
-MYB-3R;trihelix;should not
-MYB-3R;MYB-3R;should
-MYB-4R;G2-like_Domain;should not
-MYB-4R;Response_reg;should not
-MYB-4R;trihelix;should not
-MYB-4R;MYB-4R;should
-MYB-related;ARID;should not
-MYB-related;G2-like_Domain;should not
-MYB-related;Myb_DNA-binding;should
-MYB-related;Response_reg;should not
-MYB-related;trihelix;should not
-MYB-related;MYB-2R;should not
-MYB-related;MYB-3R;should not
-MYB-related;MYB-4R;should not
-NAC;NAM;should
-NZZ;NOZZLE;should
-OFP;Ovate;should
-PcG_EZ;CXC;should
-PcG_EZ;SET;should
-PcG_FIE;FIE_clipped_for_HMM;should
-PcG_FIE;WD40;should
-PcG_VEFS;VEFS-Box;should
-PcG_VEFS;zf-C2H2;should not
-PcG_MSI;WD40;should
-PcG_MSI;CAF1C_H4-bd;should
-PcG_MSI;FIE_clipped_for_HMM;should not
-PHD;Myb_DNA-binding;should not
-PHD;Alfin-like;should not
-PHD;ARID;should not
-PHD;DDT;should not
-PHD;Homeobox;should not
-PHD;JmjC;should not
-PHD;JmjN;should not
-PHD;PHD;should
-PHD;SWIB;should not
-PHD;zf-TAZ;should not
-PHD;zf-MIZ;should not
-PHD;zf-CCCH;should not
-PHD;HMG_box;should not
-PLATZ;PLATZ;should
-Pseudo ARR-B;CCT;should
-Pseudo ARR-B;Response_reg;should
-Pseudo ARR-B;tify;should not
-RB;RB_B;should
-Rcd1-like;Rcd1;should
-Rel;RHD_DNA_bind;should
-RF-X;RFX_DNA_binding;should
-RRN3;RRN3;should
-Runt;Runt;should
-S1Fa-like;S1FA;should
-SAP;STER_AP;should
-SBP;SBP;should
-SET;zf-C2H2;should not
-SET;TCR;should not
-SET;CXC;should not
-SET;PHD;should not
-SET;Myb_DNA-binding;should not
-SET;SET;should
-Sigma70-like;Sigma70_r2;should
-Sigma70-like;Sigma70_r3;should
-Sigma70-like;Sigma70_r4;should
-Sin3;PAH;should
-Sin3;WRKY;should not
-Sir2;SIR2;should
-SOH1;Med31;should
-SRS;DUF702;should
-SWI/SNF_BAF60b;SWIB;should
-SWI/SNF_SNF2;AP2;should not
-SWI/SNF_SNF2;PHD;should not
-SWI/SNF_SNF2;SNF2_N;should
-SWI/SNF_SNF2;zf-CCCH;should not
-SWI/SNF_SNF2;Myb_DNA-binding;should not
-SWI/SNF_SNF2;HMG_box;should not
-SWI/SNF_SWI3;SWIRM;should
-SWI/SNF_SWI3;Myb_DNA-binding;should
-TEA;TEA;should
-TFb2;Tfb2;should
-tify;tify;should
-TRAF;BTB;should
-TRAF;zf-TAZ;should not
-Trihelix;trihelix;should
-TUB;Tub;should
-ULT;ULT_Domain;should
-VARL;VARL;should
-VOZ;VOZ_Domain;should
-Whirly;Whirly;should
-WRKY;WRKY;should
-Zinc finger, AN1 and A20 type;zf-AN1;should
-Zinc finger, AN1 and A20 type;zf-C2H2;should not
-Zinc finger, MIZ type;zf-MIZ;should
-Zinc finger, MIZ type;zf-C2H2;should not
-Zinc finger, ZPR1;zf-ZPR1;should
-Zn_clus;Zn_clus;should
-ALOG;ALOG;should
-C2H2;C2H2-IDD;should not
-C2H2_IDD;C2H2-IDD;should
-C2H2_IDD;zf-C2H2;should
-MYST;zf-MYST;should
-CBP;CBP;should
-CBP;zf-TAZ;should
-CBP;BTB;should not
-TAFII250;DUF3591;should
-LOB1;bZIP_1;should not
-LOB1;bZIP_2;should not
-LOB1;DUF260;should
-LOB1;HLH;should not
-LOB1;Homeobox;should not
-LOB2;LOB2;should
-LDL/FLD;SWIRM;should
-LDL/FLD;Myb_DNA-binding;should not
-ADA2;zz-ADA2;should
-ADA2;Myb_DNA-binding;should
-RKD;RWP-RK;should
-RKD;NLP;should not
-NLP;RWP-RK;should
-NLP;NLP;should
-CRF;CRF;should
-CRF;AP2;should
-GIY_YIG;GIY_YIG;should
-ZPR;ZPR;should
-HD-LD;LD;should
-HD-NDX;NDX;should
-HD-SAWADEE;SAWADEE;should
-C1HDZ;C1HDZ;should
-C1HDZ;Homeobox;should
-C1HDZ;START;should not
-C1HDZ;MEKHLA;should not
-C2HDZ;C2HDZ;should
-C2HDZ;Homeobox;should
-C2HDZ;START;should not
-C2HDZ;MEKHLA;should not
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tapscan_rules_v82.txt	Thu Feb 22 10:07:53 2024 +0000
@@ -0,0 +1,302 @@
+ABI3/VP1;AP2;should not
+ABI3/VP1;Auxin_resp;should not
+ABI3/VP1;B3;should
+ABI3/VP1;WRKY;should not
+Alfin-like;Alfin-like;should
+Alfin-like;Homeobox;should not
+Alfin-like;zf-TAZ;should not
+Alfin-like;PHD;should not
+AP2;AP2;should
+AP2;CRF;should not
+ARF;Auxin_resp;should
+Argonaute;Piwi;should
+Argonaute;PAZ;should
+ARID;ARID;should
+Aux/IAA;AUX_IAA;should
+Aux/IAA;Auxin_resp;should not
+Aux/IAA;B3;should not
+BBR/BPC;GAGA_bind;should
+BES1;BES1_N;should
+bHLH;HLH;should
+bHLH;TCP;should not
+bHLH_TCP;TCP;should
+bHSH;TF_AP-2;should
+BSD domain containing;BSD;should
+bZIP1;bZIP_1;should
+bZIP1;HLH;should not
+bZIP1;Homeobox;should not
+bZIP2;bZIP_2;should
+bZIP2;HLH;should not
+bZIP2;Homeobox;should not
+bZIPAUREO;bZIP_AUREO;should
+bZIPAUREO;HLH;should not
+bZIPAUREO;Homeobox;should not
+bZIPCDD;bZIP_CDD;should
+bZIPCDD;HLH;should not
+bZIPCDD;Homeobox;should not
+C2C2_CO-like;CCT;should
+C2C2_CO-like;GATA;should not
+C2C2_CO-like;tify;should not
+C2C2_CO-like;PLATZ;should not
+C2C2_CO-like;zf-B_box;should
+C2C2_Dof;zf-Dof;should
+C2C2_Dof;GATA;should not
+C2C2_GATA;GATA;should
+C2C2_GATA;tify;should not
+C2C2_GATA;zf-Dof;should not
+C2C2_YABBY;YABBY;should
+C2H2;zf-C2H2;should
+C2H2;zf-MIZ;should not
+C3H;AP2;should not
+C3H;SRF-TF;should not
+C3H;MYB-2R;should not
+C3H;MYB-3R;should not
+C3H;MYB-4R;should not
+C3H;zf-C2H2;should not
+C3H;zf-CCCH;should
+CAMTA;CG-1;should
+CAMTA;IQ;should
+NF-YA;bZIP_1;should not
+NF-YA;bZIP_2;should not
+NF-YA;CBFB_NFYA;should
+NF-YB;NF-YB;should
+NF-YB;NF-YC;should not
+NF-YC;NF-YB;should not
+NF-YC;NF-YC;should
+NF-YC;HMG_box;should not
+Coactivator p15;PC4;should
+CPP;TCR;should
+CSD;CSD;should
+CudA;STAT_bind;should
+CudA;SH2;should
+DBP;DNC;should
+DBP;PP2C;should
+DDT;DDT;should
+DDT;Homeobox;should not
+DDT;Alfin-like;should not
+Dicer;Piwi;should not
+Dicer;DEAD;should
+Dicer;Helicase_C;should
+Dicer;Ribonuclease_3;should
+Dicer;dsrm;should
+DUF246 domain containing/O-FucT;O-FucT;should
+DUF296 domain containing;DUF296;should
+DUF547 domain containing;DUF547;should
+DUF632 domain containing;DUF632;should
+DUF833 domain containing/TANGO2;TANGO2;should
+E2F/DP;E2F_TDP;should
+EIL;EIN3;should
+FHA;FHA;should
+GARP_ARR-B_G2;CCT;should not
+GARP_ARR-B_G2;G2-like_Domain;should
+GARP_ARR-B_G2;Response_reg;should
+GARP_ARR-B_Myb;CCT;should not
+GARP_ARR-B_Myb;Response_reg;should
+GARP_ARR-B_Myb;Myb_DNA-binding;should
+GARP_G2-like;G2-like_Domain;should
+GARP_G2-like;Response_reg;should not
+GARP_G2-like;Myb_DNA-binding;should not
+GeBP;DUF573;should
+GIF;SSXT;should
+GNAT;Acetyltransf_1;should
+GNAT;PHD;should not
+GRAS;GRAS;should
+GRF;QLQ;should
+GRF;WRC;should
+C3HDZ;Homeobox;should
+C3HDZ;START;should
+C3HDZ;MEKHLA;should
+C4HDZ;Homeobox;should
+C4HDZ;START;should
+C4HDZ;MEKHLA;should not
+HD_PLINC;ZF-HD_dimer;should
+HD_WOX;WOX_HD;should
+HD_DDT;Homeobox;should
+HD_DDT;DDT;should
+HD_DDT;WHIM1;should
+HD_DDT;WSD;should
+HD_PHD;PHD;should
+HD_PHD;Homeobox;should
+HD_PINTOX;Homeobox;should
+HD_PINTOX;PINTOX;should
+HD_TALE_BEL;Homeobox_KN;should
+HD_TALE_BEL;BEL;should
+HD_TALE_KNOX1;Homeobox_KN;should
+HD_TALE_KNOX1;KNOX1;should
+HD_TALE_KNOX1;KNOX2;should
+HD_TALE_KNOX1;KNOXC;should not
+HD_TALE_KNOX2;Homeobox_KN;should
+HD_TALE_KNOX2;KNOX1;should
+HD_TALE_KNOX2;KNOX2;should
+HD_TALE_KNOX2;KNOXC;should
+HD-other;EIN3;should not
+HD-other;Homeobox;should
+HD-other;bZIP_1;should not
+HD-other;WOX_HD;should not
+HD-other;PINTOX;should not
+HD-other;PHD;should not
+HD-other;BEL;should not
+HD_TALE;Homeobox_KN;should
+HD_TALE;BEL;should not
+HD_TALE;KNOX1;should not
+HD_TALE;KNOX2;should not
+HMG;ARID;should not
+HMG;HMG_box;should
+HMG;YABBY;should not
+HRT;HRT;should
+HSF;HSF_DNA-bind;should
+IWS1;Med26;should
+Jumonji_PKDM7;JmjC;should
+Jumonji_PKDM7;JmjN;should
+Jumonji_PKDM7;zf-C5HC2;should
+Jumonji_PKDM7;FYRN;should
+Jumonji_PKDM7;FYRC;should
+Jumonji_Other;JmjC;should
+LFY;FLO_LFY;should
+LIM;two_or_more_LIM;should
+LUG;LUFS_Domain;should
+MADS;SRF-TF;should
+MADS;K-box;should not
+MADS_MIKC;SRF-TF;should
+MADS_MIKC;K-box;should
+MBF1;MBF1;should
+Med6;Med6;should
+Med7;Med7;should
+mTERF;mTERF;should
+MYB-2R;G2-like_Domain;should not
+MYB-2R;Response_reg;should not
+MYB-2R;trihelix;should not
+MYB-2R;MYB-2R;should
+MYB-3R;G2-like_Domain;should not
+MYB-3R;Response_reg;should not
+MYB-3R;trihelix;should not
+MYB-3R;MYB-3R;should
+MYB-4R;G2-like_Domain;should not
+MYB-4R;Response_reg;should not
+MYB-4R;trihelix;should not
+MYB-4R;MYB-4R;should
+MYB-related;ARID;should not
+MYB-related;G2-like_Domain;should not
+MYB-related;Myb_DNA-binding;should
+MYB-related;Response_reg;should not
+MYB-related;trihelix;should not
+MYB-related;MYB-2R;should not
+MYB-related;MYB-3R;should not
+MYB-related;MYB-4R;should not
+NAC;NAM;should
+NZZ;NOZZLE;should
+OFP;Ovate;should
+PcG_EZ;CXC;should
+PcG_EZ;SET;should
+PcG_FIE;FIE_clipped_for_HMM;should
+PcG_FIE;WD40;should
+PcG_VEFS;VEFS-Box;should
+PcG_VEFS;zf-C2H2;should not
+PcG_MSI;WD40;should
+PcG_MSI;CAF1C_H4-bd;should
+PcG_MSI;FIE_clipped_for_HMM;should not
+PHD;Myb_DNA-binding;should not
+PHD;Alfin-like;should not
+PHD;ARID;should not
+PHD;DDT;should not
+PHD;Homeobox;should not
+PHD;JmjC;should not
+PHD;JmjN;should not
+PHD;PHD;should
+PHD;SWIB;should not
+PHD;zf-TAZ;should not
+PHD;zf-MIZ;should not
+PHD;zf-CCCH;should not
+PHD;HMG_box;should not
+PLATZ;PLATZ;should
+Pseudo ARR-B;CCT;should
+Pseudo ARR-B;Response_reg;should
+Pseudo ARR-B;tify;should not
+RB;RB_B;should
+Rcd1-like;Rcd1;should
+Rel;RHD_DNA_bind;should
+RF-X;RFX_DNA_binding;should
+RRN3;RRN3;should
+Runt;Runt;should
+S1Fa-like;S1FA;should
+SAP;STER_AP;should
+SBP;SBP;should
+SET;zf-C2H2;should not
+SET;TCR;should not
+SET;CXC;should not
+SET;PHD;should not
+SET;Myb_DNA-binding;should not
+SET;SET;should
+Sigma70-like;Sigma70_r2;should
+Sigma70-like;Sigma70_r3;should
+Sigma70-like;Sigma70_r4;should
+Sin3;PAH;should
+Sin3;WRKY;should not
+Sir2;SIR2;should
+SOH1;Med31;should
+SRS;DUF702;should
+SWI/SNF_BAF60b;SWIB;should
+SWI/SNF_SNF2;AP2;should not
+SWI/SNF_SNF2;PHD;should not
+SWI/SNF_SNF2;SNF2_N;should
+SWI/SNF_SNF2;zf-CCCH;should not
+SWI/SNF_SNF2;Myb_DNA-binding;should not
+SWI/SNF_SNF2;HMG_box;should not
+SWI/SNF_SWI3;SWIRM;should
+SWI/SNF_SWI3;Myb_DNA-binding;should
+TEA;TEA;should
+TFb2;Tfb2;should
+tify;tify;should
+TRAF;BTB;should
+TRAF;zf-TAZ;should not
+Trihelix;trihelix;should
+TUB;Tub;should
+ULT;ULT_Domain;should
+VARL;VARL;should
+VOZ;VOZ_Domain;should
+Whirly;Whirly;should
+WRKY;WRKY;should
+Zinc finger, AN1 and A20 type;zf-AN1;should
+Zinc finger, AN1 and A20 type;zf-C2H2;should not
+Zinc finger, MIZ type;zf-MIZ;should
+Zinc finger, MIZ type;zf-C2H2;should not
+Zinc finger, ZPR1;zf-ZPR1;should
+Zn_clus;Zn_clus;should
+ALOG;ALOG;should
+C2H2;C2H2-IDD;should not
+C2H2_IDD;C2H2-IDD;should
+C2H2_IDD;zf-C2H2;should
+MYST;zf-MYST;should
+CBP;CBP;should
+CBP;zf-TAZ;should
+CBP;BTB;should not
+TAFII250;DUF3591;should
+LOB1;bZIP_1;should not
+LOB1;bZIP_2;should not
+LOB1;DUF260;should
+LOB1;HLH;should not
+LOB1;Homeobox;should not
+LOB2;LOB2;should
+LDL/FLD;SWIRM;should
+LDL/FLD;Myb_DNA-binding;should not
+ADA2;zz-ADA2;should
+ADA2;Myb_DNA-binding;should
+RKD;RWP-RK;should
+RKD;NLP;should not
+NLP;RWP-RK;should
+NLP;NLP;should
+CRF;CRF;should
+CRF;AP2;should
+GIY_YIG;GIY_YIG;should
+ZPR;ZPR;should
+HD-LD;LD;should
+HD-NDX;NDX;should
+HD-SAWADEE;SAWADEE;should
+C1HDZ;C1HDZ;should
+C1HDZ;Homeobox;should
+C1HDZ;START;should not
+C1HDZ;MEKHLA;should not
+C2HDZ;C2HDZ;should
+C2HDZ;Homeobox;should
+C2HDZ;START;should not
+C2HDZ;MEKHLA;should not
--- a/tapscan_script_v74.pl	Wed Feb 14 13:54:16 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,916 +0,0 @@
-#!/usr/bin/perl
-use strict;
-use warnings;
-
-# Written by Gerrit Timmerhaus (gerrit.timmerhaus@biologie.uni-freiburg.de).
-# Changes included by Kristian Ullrich, Per Wilhelmsson and Romy Petroll.
-
-# Script to extract all detected domains out of a hmmsearch results file and classify the families of all used proteins based on these domains.
-# The classification depends on a table which contains all known classification rules for the protein families of interest and on specific coverage values defined for every domain.
-# The script provides three outputs, namely output.1, output.2 and output.3. The output files are tables in ";"-delimited format.
-# The structure of output.1 is: "sequence ID ; TAP family ; number of classifications ; domains".
-# Output.3 shares in principle the same structure as output.1, except that subfamilies are considered. ("sequence ID ; TAP family ; Subfamily ; number of classifications ; domains")
-# The superior TAP family is specified first, followed by the subfamily. If a TAP family has no subfamily, the TAP family is specified first and then a "-".
-# The structure of output.2 is: "TAP family";"number of detected proteins".
-# More than one entry for a protein is possible because the classification rules may allow more than one classification.
-#
-# The script must be startet with the arguments <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamily classifications file> <"filter" if desired>
-
-if (!@ARGV or ($ARGV [0] eq "-h") or ($ARGV [0] eq "-help")) {
-	print "Usage: extract.and.classify.pl <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamily classifications file> <\"filter\" (if desired)>\n\n";
-	exit;
-}
-
-# hmmsearch_output: domtblout file
-my $hmmsearch_output = $ARGV [0];
-# decision_table: rules file
-my $decision_table = $ARGV [1];
-# family_classifications: output.1
-my $family_classifications = $ARGV [2];
-# family_statistics: output.2
-my $family_statistics = $ARGV [3];
-# subfamily_classifications: output.3
-my $subfamily_classifications = $ARGV [4];
-# domspec_cuts: coverage values file
-my $domspec_cuts = $ARGV [5];
-# gene_model_filte: filter for ARATH and ORYSA
-my $gene_model_filter = $ARGV [6];
-
-if ($family_statistics eq "") {
-	print "Usage: extract.and.classify.pl <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamil classifications file> <\"filter\" (if desired)>\n\n";
-	exit;
-}
-
-if ($gene_model_filter and $gene_model_filter eq "filter") {
-	print "\nGene model filter is activated. It only works for TAIR (Arabidopsis) and TIGR (Rice) proteins up to now\n";
-}
-
-# Array where the $hmmsearch-output/domtblout file will be stored
-my @output = ();
-# Array with domain-specific coverage values
-my @cuts = ();
-# Array with rules
-my @dec_table = ();
-# Counter for the number of detected domains in the hmmsearch output file
-my $entry_counter = 0;
-# Containes the actual result for a query sequence
-my $akt_entry = "";
-# Used to define query entry to ignore similar domains
-my $whole_entry = "";
-# Includes the final entries after ignoring similar domains
-my @results_of_extraction = ();
-# Used to define query entry to ignore similar domains
-my $extracted_domain = "";
-# Used to define query entry to ignore similar domains
-my $present = "";
-# Used to define query entry to ignore similar domains
-my $protein = "";
-
-my $lek = "";
-
-############################################
-### 1. Read in the hmmsearch output file ###
-############################################
-
-print "\n*** reading in $hmmsearch_output ***\n\n";
-
-@output = get_file_data("$hmmsearch_output");
-
-print "*** Parsing $hmmsearch_output ***\n\n";
-
-# If wrong format exit the program, ninth row from the end
-if ($output [-9] !~ /^# Program:         hmmsearch.*/) {
-	print "wrong file format. $hmmsearch_output has to be a hmmsearch output file\n\n";
-	exit;
-}
-
-### 1.1 Trim input file (hmmsearch) to fit script. Erase lines starting with #.
-@output = grep !/^#.*/, @output;
-
-# Read domain-specific coverage values file
-# domname	cut-off
-# EIN3		0.5874125874
-
-open(FILENAME,$domspec_cuts);
-my %cuts = map { chomp; split /\t/ } <FILENAME>;
-
-### 1.2 Use first (prot name), third (domain name), eight (dom evalue), fourteenth (# overlapping) and fifteenth (# domain) column of data input(array element).
-foreach my $output (@output) {
-	$output =~ /^(\S+)\s+\S+\s+\S+\s+(\S+)\s+\S+\s+(\S+)\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+\S+\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+.*/;
-	$lek = (($6-$5)+1)/$3;
-	$output = $1."\t".$2."\t".$4."\t".$lek;
-	}
-
-# Stucture:
-# ARATHwo_AT2G03430.1	Ank	1.8e-08		0.93
-# ARATHwo_AT2G03430.1	Ank     7.8e-10		0.95
-# ARATHwo_AT2G03430.1	Ank     1.2e-08		0.93
-# ARATHwo_AT2G03430.1	Ank     1.2e-08		0.63
-
-################################
-### 2. Modify the input data ###
-################################
-
-### 2.1 Run through length_cut_off loop (throws out entries below coverage values)
-my @cutarray;
-foreach my $output (@output) {
-	$output =~ /^(\S+)\t+(\S+)\t(\S+)\t(\S+)/;
-	if ($4 > $cuts{$2}) {
-	push (@cutarray, $output);
-	}
-}
-
-# End up with (prot name) (domain name) (dom evalue) (overlapping)
-
-# Structure:
-# ARATHwo_AT2G03430.1	Ank	1.8e-08		0.93
-# ARATHwo_AT2G03430.1	Ank     7.8e-10		0.95
-# ARATHwo_AT2G03430.1	Ank     1.2e-08		0.93
-
-
-### 2.2 Retain hit with the lowest evalue
-
-# String for the previous protein name
-my $old_prot = "";
-# Coordinates for array filling
-my $x = -1;
-my $y = 1;
-my $scoreeval;
-my @newarray;
-foreach my $cutarray (@cutarray) {
-	$cutarray =~ /^(\S+)\t+(\S+)\t(\S+)\t(\S+)/;
-	$akt_entry = $1.$2;
-	# If the protein contains more than one of the same domain
-	if ($old_prot eq $akt_entry) {
-		if ($3>$scoreeval) {
-			$y++;
-			$newarray[$x] = $1."\t".$2."\t".$3."\t".$y;
-			}
-		else {
-			$y++;
-			$newarray[$x] = $1."\t".$2."\t".$scoreeval."\t".$y;
-			}
-		}
-	# If the protein is new reset $j and push the entry in the next array
-	else {
-		$y = 1;
-		$x++;
-		$newarray[$x] = $1."\t".$2."\t".$3."\t".$y;
-	}
-	$old_prot = $akt_entry;
-	$scoreeval = $3
-
-}
-# Final structure:
-# ARATHwo_AT2G03430.1	Ank     7.8e-10		3
-
-### 2.3 Then sort @output primarily on prot name($1) and then secondarily on eval($3) to fit the "similar"-procedure.
-@newarray = sort { (split '\t', $a)[0] cmp (split '\t', $b)[0] || (split '\t', $a)[2] <=> (split '\t', $b)[2] } @newarray;
-
-# Flags for merged domains
-my $ARR_B_counter = 0;
-my $bZIP_counter = 0;
-my $ET_counter = 0;
-
-# Flags for similar domain omission
-my $WD40domain = 0;		# modified for FIE rule
-my $FIEdomain = 0;		# --------'''''--------
-my $MYBdomain = 0;
-my $G2domain = 0;
-my $YBdomain = 0;
-my $YCdomain = 0;
-my $PHDdomain = 0;
-my $Alfindomain = 0;
-my $Dr1domain = 0;
-my $GATAdomain = 0;
-my $Dofdomain = 0;
-my $C1domain = 0;
-
-foreach my $line (@newarray) {
-
-# Reset the flags for domain omission
-	if ($line !~ /^$protein.*$/ ) {
-		$WD40domain = 0;
-		$FIEdomain = 0;
-		$MYBdomain = 0;
-		$G2domain = 0;
-		$YBdomain = 0;
-		$YCdomain = 0;
-		$PHDdomain = 0;
-		$Alfindomain = 0;
-		$Dr1domain = 0;
-		$GATAdomain = 0;
-		$Dofdomain = 0;
-		$C1domain = 0;
-
-	}
-
-	# Get the Query entry
-	$line =~ /^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/;
-	$protein = "$1";
-	$extracted_domain = $2;
-	$present = "$4"; # FIX! number for many Myb
-	$whole_entry = $1.";".$2.";".$3.";".$4;
-
-
-	# Ignore FIE if it is not better scored than WD40
-	if ($extracted_domain eq "WD40" and $FIEdomain == 0) {$WD40domain = 1;}
-
-	# Ignore similar domains part 1
-
-	if ($extracted_domain eq "Myb_DNA-binding" and $G2domain == 0) {$MYBdomain = 1;}
-	if ($extracted_domain eq "G2-like_Domain" and $MYBdomain == 0) {$G2domain = 1;}
-
-	if ($extracted_domain eq "PHD" and $Alfindomain == 0 and $C1domain == 0) {$PHDdomain = 1;}
-	if ($extracted_domain eq "Alfin-like" and $PHDdomain == 0 and $C1domain == 0) {$Alfindomain = 1;}
-	if ($extracted_domain eq "C1_2" and $Alfindomain == 0 and $PHDdomain == 0) {$C1domain = 1;}
-
-        if ($extracted_domain eq "GATA" and $Dofdomain == 0) {$GATAdomain = 1;}
-	if ($extracted_domain eq "zf-Dof" and $GATAdomain == 0) {$Dofdomain = 1;}
-
-	# Ignore similar domains part 2
-
-	if ($extracted_domain eq "FIE_clipped_for_HMM" and $WD40domain == 1) {next;}
-	if ($extracted_domain eq "Myb_DNA-binding" and $G2domain == 1) {next;}
-	if ($extracted_domain eq "G2-like_Domain" and $MYBdomain == 1) {next;}
-
-	if ($extracted_domain eq "PHD" and ($Alfindomain == 1 or $C1domain == 1)) {next;}
-	if ($extracted_domain eq "Alfin-like" and ($PHDdomain == 1 or $C1domain ==1)) {next;}
-	if ($extracted_domain eq "C1_2" and ($PHDdomain == 1 or $Alfindomain == 1)) {next;}
-
-       	if ($extracted_domain eq "GATA" and $Dofdomain == 1) {next;}
-	if ($extracted_domain eq "zf-Dof" and $GATAdomain == 1) {next;}
-
-	# Save the entry
-	push @results_of_extraction, $whole_entry;
-	$entry_counter++;
-}
-print "$entry_counter domain matches were found in $hmmsearch_output\n\n";
-
-# Sort the entries:
-my @sorted_results_of_extraction = @results_of_extraction;
-print "*** Preparing hmmsearch results for classification ***\n\n";
-
-# This array will be filled with the sorted proteins
-my $array_of_arrays = [];
-
-# String for the previous protein name
-my $old_entry = "";
-
-# Coordinates for array filling
-my $i = -1;
-my $j = 0;
-
-@dec_table = get_file_data("$decision_table");
-
-if ($dec_table [0] !~ /^[^;]+;[^;]+;[^;]/) {
-	print "wrong file format. $decision_table has to be in the format family;domain;type\n\n";
-	exit;
-}
-
-# Formate the entries
-foreach my $domain_entry (@sorted_results_of_extraction) {
-
-	$domain_entry =~ /^([^;]+);/;
-	$akt_entry = $1;
-	# If the protein has more than one domain push it behind the old entry
-	if ($old_entry eq $akt_entry) {
-		$j++;
-		$array_of_arrays->[$i][$j] = $domain_entry;
-	}
-
-	# If the protein is new reset $j and push the entry in the next array
-	else {
-		$j = 0;
-		$i++;
-		$array_of_arrays->[$i][$j] = $domain_entry;
-	}
-	$old_entry = $akt_entry;
-
-}
-$old_entry = "";
-
-# Remember the number of entries to use them later
-my $number_of_proteins = $i+1;
-print "$number_of_proteins proteins were found in $hmmsearch_output\n";
-$number_of_proteins--;
-
-#########################################
-### 3. Get the classification entries ###
-#########################################
-
-# Make a list of all families in the classification rules file
-my @liste_alle_familien = ('0_no_family_found');
-
-my $ARR_B_in_list = 0;
-my $bZIP_in_list = 0;
-my $ET_in_list = 0;
-
-my $array_of_classifications = [];
-$i = -1;
-$j = 0;
-
-my @classifications_input = get_file_data("$decision_table");
-
-# Sort the classifications
-my @classifications = sort @classifications_input;
-
-
-foreach my $classification (@classifications) {
-	$classification =~ /^([^;]+);/;
-	$akt_entry = $1;
-
-	# If the family has more than one domain entry push it behind the old entry
-	if ($old_entry eq $akt_entry) {
-
-		$j++;
-		$array_of_classifications->[$i][$j] = $classification;
-	}
-
-	# If the family is new push it in the next array and reset $j
-	else {
-		$j = 0;
-		$i++;
-		$array_of_classifications->[$i][$j] = $classification;
-		# Add family to list of all families in the classification rules
-		push(@liste_alle_familien,$akt_entry);
-
-		# Add hardcoded "OR" defined families to list of families if subfamily is present
-		# Do this just once for each family
-		if (($akt_entry eq "GARP_ARR-B_Myb") or ($akt_entry eq "GARP_ARR-B_G2")) {
-                	if ($ARR_B_in_list == 0) {
-				push(@liste_alle_familien,"GARP_ARR-B");
-				$ARR_B_in_list++;
-			}}
-		if (($akt_entry eq "bZIP1") or ($akt_entry eq "bZIP2") or ($akt_entry eq "bZIPAUREO") or ($akt_entry eq "bZIPCDD")) {
-                	if ($bZIP_in_list == 0) {
-				push(@liste_alle_familien,"bZIP");
-				$bZIP_in_list++;
-			}}
-		if (($akt_entry eq "HRT") or ($akt_entry eq "GIY_YIG")) {
-                	if ($ET_in_list == 0) {
-				push(@liste_alle_familien,"ET");
-				$ET_in_list++;
-			}}
-	}
-	$old_entry = $akt_entry;
-}
-
-my $number_of_classifications = $i+1;
-print "$number_of_classifications different families were found in $decision_table\n\n";
-$number_of_classifications--;
-
-print "*** begin classification ***\n\n";
-
-# Create the output files (output.1 and output.3)
-my $outputfile = "$family_classifications";
-my $subfamilyoutput = "$subfamily_classifications";
-
-unless (open(FAMILY_CLASSIFICATIONS, ">$outputfile")) {
-	print "Cannot open file \"$outputfile\" to write to!!\n\n";
-	exit;
-}
-
-unless (open(SUBFAMILY_CLASSIFICATIONS, ">$subfamilyoutput")) {
-	print "Cannot open file \"$subfamilyoutput\" to write to!!\n\n";
-	exit;
-}
-
-print FAMILY_CLASSIFICATIONS "family classifications for $hmmsearch_output\n";
-print SUBFAMILY_CLASSIFICATIONS "family classifications for $hmmsearch_output\n";
-
-# Counter for classified families
-my $classified_families = 0;
-# Counter for unclassified families
-my $unclassified_families = 0;
-# Counter for the different amount of entries in classification possibilities
-my @entry_counter = [];
-# Counter for the possible classifications for the current protein
-my $possibilities = 0;
-# For later famliy statistics
-my @family_list = [];
-# This array will be filled with the result. Later written in output.1 and output.3
-my @family_classifications_file = [];
-my @subfamily_classifications_file = [];
-
-# One time for every protein in array of arrays
-for (my $ii = 0; $ii <= $number_of_proteins; $ii++) {
-	my $jj = 0;
-	$possibilities = 0;
-
-    # Flag to mark that any family was found for the current protein
-	my $member_found = 0;
-
-	# In this string all the domains of one protein will be stored
-	# The ; signs are important to mark a single entry (for example to differ AP2 and TF_AP2)
-	my $domains = ";";
-
-	# Get the domains in all elements of the current protein array:
-	while ($array_of_arrays->[$ii][$jj]) {
-
-		$array_of_arrays->[$ii][$jj] =~ /([^;]+);[^;]+;(\d+)$/;
-		# For discrimination between MYB and MYB-related. If more than 1 MYB domains was found replaced "Myb_DNA-binding" with "two_or_more_Myb_DNA-binding"
-		# Same for LIM
-		if ($1 eq "Myb_DNA-binding" and $2 eq 2) {
-			$domains .= "MYB-2R;Myb_DNA-binding;";
-		}
-		elsif ($1 eq "Myb_DNA-binding" and $2 eq 3) {
-			$domains .= "MYB-3R;Myb_DNA-binding;";
-		}
-		elsif ($1 eq "Myb_DNA-binding" and $2 eq 4) {
-			$domains .= "MYB-4R;Myb_DNA-binding;";
-		}
-		elsif ($1 eq "LIM" and $2 > 1) {
-			$domains .= "two_or_more_LIM;LIM;";
-		}
-		else {
-			$domains .= "$1;";
-		}
-		$jj++;
-	}
-
-#########################
-### 4. Classification ###
-#########################
-
-# Get the name of the actual protein for the output after classification:
-$array_of_arrays->[$ii][0] =~ /^([^;]+);/;
-$akt_entry = $1;
-
-# Variables for double classification decisions:
-
-# Here the shoulds for the actual family will be stored
-my $shoulds_act_family = ";";
-# Here the shoulds of the previous CLASSIFIED family will be stored
-my $shoulds_prev_family = "";
-
-# This loop will be made for every family entry in output.1 and output.3
-for ($i = 0;$i <= $number_of_classifications; $i++) {
-	$j = 0;
-
-	# Flag to mark if the protein is a possible member of the current family
-	my $possible_member = 0;
-
-	# Reset the domain entries for the current family
-        $shoulds_act_family = ";";
-
-        # This loop will be made for every domain classification entry for the current family
-	while ($array_of_classifications->[$i][$j]) {
-
-		# For "should" entries
-		if ($array_of_classifications->[$i][$j] =~ /;should$/) {
-
-			# Get the domain from the classification entry
-			$array_of_classifications->[$i][$j] =~ /^[^;]+;([^;]+);[^;]+/;
-
-			my $domain_classification_entry = $1;
-			# Check if the protein might be a member of the current family. If yes check the next entry
-			if ($domains =~ /;$domain_classification_entry;/) {
-				$possible_member = 1;
-				$j++;
-				# All matched domains for the current family are stored here:
-				$shoulds_act_family .= "$domain_classification_entry;";
-				next;
-			}
-			$possible_member = 0;
-			$j++;
-			last;
-		}
-
-		# For "should not" entries
-		if ($array_of_classifications->[$i][$j] =~ /;should not$/) {
-
-			# Get the domain from the classification entry
-			$array_of_classifications->[$i][$j] =~ /^[^;]+;([^;]+);[^;]+/;
-
-			# Check if the protein has the domain. If yes go to the next family
-			my $domain_check = $1;
-			if ($domains =~ /;$domain_check;/) {
-					$possible_member = 0;
-					last;
-			}
-			# If not check the next entry
-			else {
-					$j++;
-			    		next;
-			}
-		}
-
-			# This happens when there is no "should" or "should not" entry at the right position
-			print "error in classification entry $array_of_classifications->[$i][$j]\n"
-	}
-
-	# If the check was successful print the found classification
-	if ($possible_member) {
-
-		$array_of_classifications->[$i][0] =~ /^([^;]+);/;
-		my $currentFamily = $1;
-		$possibilities++;
-		# The double classified proteins solution:
-		if ($possibilities > 1) {
-
-			# Fill an array with the current domains in $domains
-			my @domains_array = [];
-			my $number_of_domains = ($domains =~ tr/;/;/)-1;
-			my $tempdomains = $domains;
-			for (my $domain_counter = 0;$domain_counter != $number_of_domains;$domain_counter++) {
-
-				$tempdomains =~ /^;([^;]+);/;
-				$domains_array[$domain_counter] = $1;
-				# Remove the inserted domain from domains
-				$tempdomains =~ s/^;[^;]+;/;/;
-			}
-			# Compare the two possible families (actual and previous family)
-			my $array_counter = 0;
-			while ($array_counter < $number_of_domains) {
-				if ($shoulds_prev_family =~ /$domains_array[$array_counter]/) {
-					if ($shoulds_act_family =~ /$domains_array[$array_counter]/) {
-
-						# If actual domain is in previous and actual family go to next domain
-						$array_counter++;
-						next;
-
-					}
-					# Protein was right classified in the previous classification. The actual entry will be ignored
-
-					$possibilities--;
-					last;
-				}
-
-				# If actual domain is in the actual family but not in the previous. The actual classification is right
-				if ($shoulds_act_family =~ /$domains_array[$array_counter]/) {
-
-					pop @family_classifications_file;
-					pop @subfamily_classifications_file;
-
-					# Merge "GARBP_ARR-B_Myb" and "GARP_ARR-B_G2" to "GARP_ARR-B"
-					if ( ($currentFamily eq "GARP_ARR-B_Myb") or ($currentFamily eq "GARP_ARR-B_G2")){
-
-						push @family_classifications_file, "$akt_entry;GARP_ARR-B;$possibilities$domains\n";
-						push @subfamily_classifications_file, "$akt_entry;GARP_ARR-B;-;$possibilities$domains\n";
-						if ($ARR_B_counter == 0) {
-							push @liste_alle_familien, 'GARP_ARR-B';
-							$ARR_B_counter++;
-						}
-					}
-					elsif ( ($currentFamily eq "bZIP1") or ($currentFamily eq "bZIP2") or ($currentFamily eq "bZIPAUREO") or ($currentFamily eq "bZIPCDD")) {
-
-						push @family_classifications_file, "$akt_entry;bZIP;$possibilities$domains\n";
-						push @subfamily_classifications_file, "$akt_entry;bZIP;-;$possibilities$domains\n";
-						if ($bZIP_counter == 0) {
-							push @liste_alle_familien, 'bZIP';
-							$bZIP_counter++;
-						}
-					}
-					elsif ( ($currentFamily eq "HRT") or ($currentFamily eq "GIY_YIG")) {
-						push @subfamily_classifications_file, "$akt_entry;ET;-;$possibilities$domains\n";
-						push @family_classifications_file, "$akt_entry;ET;$possibilities$domains\n";
-						if ($ET_counter == 0) {
-							push @liste_alle_familien, 'ET';
-							$ET_counter++;
-						}
-					}
-					else {
-
-						push @family_classifications_file, "$akt_entry;$currentFamily;$possibilities$domains\n";
-						push @subfamily_classifications_file, "$akt_entry;$currentFamily;-;$possibilities$domains\n";
-					}
-					$possibilities--;
-					last;
-
-				}
-
-					$array_counter++;
-			}
-
-		}
-		# Normal enty: if the family is classified for one family write it in the array
-		else {
-
-			# Store the matched domains of the actual family for respectively later doubly classification decision
-			$shoulds_prev_family = $shoulds_act_family;
-			# Store the entry in the output array
-			# Merge "GARBP_ARR-B_Myb" and "GARP_ARR-B_G2" to "GARP_ARR-B"
-			# Add GARP_ARR-B to @liste_alle_familien ONCE
-
-				if ( ($currentFamily eq "GARP_ARR-B_Myb") or ($currentFamily eq "GARP_ARR-B_G2")){
-					push @family_classifications_file, "$akt_entry;GARP_ARR-B;$possibilities$domains\n";
-					push @subfamily_classifications_file, "$akt_entry;GARP_ARR-B;-;$possibilities$domains\n";
-					if ($ARR_B_counter == 0) {
-						push @liste_alle_familien, 'GARP_ARR-B';
-						$ARR_B_counter++;
-					}
-				}
-				elsif (($currentFamily eq "bZIP1") or ($currentFamily eq "bZIP2") or ($currentFamily eq "bZIPAUREO") or ($currentFamily eq "bZIPCDD")){
-					push @family_classifications_file, "$akt_entry;bZIP;$possibilities$domains\n";
-					push @subfamily_classifications_file, "$akt_entry;bZIP;-;$possibilities$domains\n";
-					if ($bZIP_counter == 0) {
-						push @liste_alle_familien, 'bZIP';
-						$bZIP_counter++;
-					}
-				}
-				elsif (($currentFamily eq "HRT") or ($currentFamily eq "GIY_YIG")){
-					push @family_classifications_file, "$akt_entry;ET;$possibilities$domains\n";
-					push @subfamily_classifications_file, "$akt_entry;ET;-;$possibilities$domains\n";
-					if ($ET_counter == 0) {
-						push @liste_alle_familien, 'ET';
-						$ET_counter++;
-					}
-				}
-				else {
-
-				push @family_classifications_file, "$akt_entry;$currentFamily;$possibilities$domains\n";
-				push @subfamily_classifications_file, "$akt_entry;$currentFamily;-;$possibilities$domains\n";
-				}
-
-                	$classified_families++;
-			}
-
-		# Flag to mark that any family was found for the current protein
-		$member_found = 1;
-		}
-
-	}
-
-	if ($possibilities == 0) {
-
-		# If no family was found for the current protein give this out
-		push @family_classifications_file, "$akt_entry;0_no_family_found;0$domains\n";
-		push @subfamily_classifications_file, "$akt_entry;0_no_family_found;-;0$domains\n";
-		$unclassified_families++;
-	}
-
-	# Increment the number of possible members in the array at the right position
-	else {
-
-		my $temp = $entry_counter[$possibilities];
-		$temp++;
-		$entry_counter[$possibilities] = $temp;
-	}
-
-}
-# Define for which TAP families also subfamilies may be present:
-
-foreach my $subfamily_classifications_file (@subfamily_classifications_file) {
-
-	# If "C2H2" was assigned to a sequence, write "C2H2;C2H2" to output.3 as TAP family and subfamily, respectively.
-	if ($subfamily_classifications_file =~ /C2H2/ ) {
-		$subfamily_classifications_file =~ s/C2H2;-/C2H2;C2H2/
-	}
-	# If "C2H2_IDD" was assigned to a sequence, write "C2H2;C2H2_IDD" to output.3 as TAP family and subfamily, respectively.
-	if ($subfamily_classifications_file =~ /C2H2_IDD/ ) {
-		$subfamily_classifications_file =~ s/C2H2_IDD;-/C2H2;C2H2_IDD/
-	}
-	if ($subfamily_classifications_file =~ /bHLH/ ) {
-		$subfamily_classifications_file =~ s/bHLH;-/bHLH;bHLH/
-	}
-	if ($subfamily_classifications_file =~ /bHLH_TCP/ ) {
-		$subfamily_classifications_file =~ s/bHLH_TCP;-/bHLH;bHLH_TCP/
-	}
-	if ($subfamily_classifications_file =~ /NF-YA/ ) {
-		$subfamily_classifications_file =~ s/NF-YA;-/NFY;NF-YA/
-	}
-	if ($subfamily_classifications_file =~ /NF-YB/ ) {
-		$subfamily_classifications_file =~ s/NF-YB;-/NFY;NF-YB/
-	}
-	if ($subfamily_classifications_file =~ /NF-YC/ ) {
-		$subfamily_classifications_file =~ s/NF-YC;-/NFY;NF-YC/
-	}
-	if ($subfamily_classifications_file =~ /C1HDZ/ ) {
-		$subfamily_classifications_file =~ s/C1HDZ;-/HDZ;C1HDZ/
-	}
-	if ($subfamily_classifications_file =~ /C2HDZ/ ) {
-		$subfamily_classifications_file =~ s/C2HDZ;-/HDZ;C2HDZ/
-	}
-	if ($subfamily_classifications_file =~ /C3HDZ/ ) {
-		$subfamily_classifications_file =~ s/C3HDZ;-/HDZ;C3HDZ/
-	}
-	if ($subfamily_classifications_file =~ /C4HDZ/ ) {
-		$subfamily_classifications_file =~ s/C4HDZ;-/HDZ;C4HDZ/
-	}
-	if ($subfamily_classifications_file =~ /MYST/ ) {
-		$subfamily_classifications_file =~ s/MYST;-/HAT;MYST/
-	}
-	if ($subfamily_classifications_file =~ /CBP/ ) {
-		$subfamily_classifications_file =~ s/CBP;-/HAT;CBP/
-	}
-	if ($subfamily_classifications_file =~ /TAFII250/ ) {
-		$subfamily_classifications_file =~ s/TAFII250;-/HAT;TAFII250/
-	}
-	if ($subfamily_classifications_file =~ /GNAT/ ) {
-		$subfamily_classifications_file =~ s/GNAT;-/HAT;GNAT/
-	}
-	if ($subfamily_classifications_file =~ /LOB1/ ) {
-		$subfamily_classifications_file =~ s/LOB1;-/LBD;LOB1/
-	}
-	if ($subfamily_classifications_file =~ /LOB2/ ) {
-		$subfamily_classifications_file =~ s/LOB2;-/LBD;LOB2/
-	}
-	if ($subfamily_classifications_file =~ /MYB-related/ ) {
-		$subfamily_classifications_file =~ s/MYB-related;-/MYB;MYB-related/
-	}
-	if ($subfamily_classifications_file =~ /SWI\/SNF_SWI3/ ) {
-		$subfamily_classifications_file =~ s/SWI\/SNF_SWI3;-/MYB-related;SWI\/SNF_SWI3/
-	}
-	if ($subfamily_classifications_file =~ /MYB-2R/ ) {
-		$subfamily_classifications_file =~ s/MYB-2R;-/MYB;MYB-2R/
-	}
-	if ($subfamily_classifications_file =~ /MYB-3R/ ) {
-		$subfamily_classifications_file =~ s/MYB-3R;-/MYB;MYB-3R/
-	}
-	if ($subfamily_classifications_file =~ /MYB-4R/ ) {
-		$subfamily_classifications_file =~ s/MYB-4R;-/MYB;MYB-4R/
-	}
-	if ($subfamily_classifications_file =~ /RKD/ ) {
-		$subfamily_classifications_file =~ s/RKD;-/RWP-RK;RKD/
-	}
-	if ($subfamily_classifications_file =~ /NLP/ ) {
-		$subfamily_classifications_file =~ s/NLP;-/RWP-RK;NLP/
-	}
-	if ($subfamily_classifications_file =~ /AP2/ ) {
-		$subfamily_classifications_file =~ s/AP2;-/AP2;AP2/
-	}
-	if ($subfamily_classifications_file =~ /CRF/ ) {
-		$subfamily_classifications_file =~ s/CRF;-/AP2;CRF/
-	}
-}
-
-### 4.1 Filter for gene models
-
-# Filter for gene models. All alternative splice variants will be removed from Arabidopsis and rice entries
-shift @family_classifications_file;
-shift @subfamily_classifications_file;
-# Sort the array for filter and a sorted output
-@family_classifications_file = sort @family_classifications_file;
-@subfamily_classifications_file = sort @subfamily_classifications_file;
-if ($gene_model_filter and $gene_model_filter eq "filter") {
-	print "*** filtering gene models: removing splice variants ***\n";
-
-	# temporary array for the filtered entries
-	my @filtered_entries = [];
-	my $models_removed_counter = 0;
-
-	my $prev_entry = "";
-	foreach my $entry_line (@family_classifications_file) {
-		if ($entry_line !~ /^([^.]+).\d+/) {
-			print "Bad format for filtering splice variants. Use \"filter\" only for TAIR (Aradopsis) or TIGR (Rice) files.\n\n";
-			exit;
-		}
-		if ($1 eq $prev_entry) {
-			$models_removed_counter++;
-			next;
-		}
-		$prev_entry = $1;
-		push @filtered_entries, $entry_line;
-	}
-	print " -> $models_removed_counter gene models were removed\n\n";
-	@family_classifications_file = @filtered_entries;
-	shift @family_classifications_file;
-}
-
-# Print the results in the array to output.1 and output.3 and push the names of the TAP families and Subfamilies into $family_list to create output.2
-foreach my $fcf_line (@family_classifications_file) {
-	$fcf_line =~ /^[^;]+;([^;]+)/;
-	#push @family_list, "$1";
-	print FAMILY_CLASSIFICATIONS "$fcf_line";
-}
-close (FAMILY_CLASSIFICATIONS);
-
-foreach my $fcf_line (@subfamily_classifications_file) {
-	$fcf_line =~ /^[^;]+;([^;]+);([^;]+)/;
-	if ($2 eq "-") {
-		push @family_list, "$1";
-	}
-	else {
-		push @family_list, "$2";
-	}
-	print SUBFAMILY_CLASSIFICATIONS "$fcf_line";
-}
-
-close (SUBFAMILY_CLASSIFICATIONS);
-
-print "*** calculating the family statistics and write it in $family_statistics ***\n\n";
-
-##################################
-### 5. Create the output files ###
-##################################
-
-my $statistics_outputfile = "$family_statistics";
-
-unless (open(FAMILY_STATISTICS, ">$statistics_outputfile")) {
-	print "Cannot open file \"$statistics_outputfile\" to write to!!\n\n";
-	exit;
-}
-
-# Count the family entries
-my @output_family_statistics = ();
-my @gefundene_familien = ();
-my $family_counter = 1;
-
-shift @family_list;
-@family_list = sort @family_list;
-
-
-my $old_family = "";
-push @family_list, 'BAD FIX'; # Makes to loop go through every fam and stops at non fam(BAD FIX).
-
-foreach my $line (@family_list) {
-	if ($line eq $old_family) {
-		$family_counter++;
-	}
-	elsif ($old_family ne "") {
-		push (@output_family_statistics,"$old_family;$family_counter\n");
-		# Add all found families to a list of found families
-		push (@gefundene_familien,"$old_family");
-	        $family_counter=1;
-	}
-	$old_family = $line;
-}
-
-my %hash = ();
-
-# Put all families from the classifictaion ruled and all found families in a hash
-foreach my $element (@gefundene_familien,@liste_alle_familien) {$hash{$element}++;}
-
-# Remove merged families
-delete $hash{'GARP_ARR-B_Myb'};
-delete $hash{'GARP_ARR-B_G2'};
-delete $hash{'bZIP1'};
-delete $hash{'bZIP2'};
-delete $hash{'bZIPAUREO'};
-delete $hash{'bZIPCDD'};
-delete $hash{'HRT'};
-delete $hash{'GIY_YIG'};
-
-# Add all not found families from the classification rules to @output_family_statistics
-# With zero as number of families found
-
-foreach my $element (keys %hash) {
-	if (($hash{$element} == 1) and ( $element eq "0_no_family_found")) {
-	push (@output_family_statistics,"$element;$unclassified_families\n");
-	}
-	if (($hash{$element} == 1) and ($element ne "0_no_family_found")) {
-	push (@output_family_statistics,"$element;0\n");
-	}
-}
-
-# Sort @output_family_statistics caseinsensitive-alphabetically
-my @sortierte_statistik = sort {lc $a cmp lc $b} @output_family_statistics;
-
-# Print headline to @sortierte_statistik
-unshift (@sortierte_statistik,"family statistics for $hmmsearch_output\n");
-
-print FAMILY_STATISTICS @sortierte_statistik;
-
-# Print FAMILY_STATISTICS "$old_family;$family_counter\n";
-
-close (FAMILY_STATISTICS);
-
-#################################################
-### 6. Give out some statistical informations ###
-#################################################
-
-$entry_counter [0] = 0;
-my $sum = 0;
-foreach my $entry (@entry_counter) {
-	$sum += $entry;
-}
-print "$classified_families classifications were found for $sum proteins.\n";
-print "This classifications are divided in:\n";
-my $count = 0;
-foreach my $element (@entry_counter) {
-	if ($count != 0) {
-		print "$element proteins were classified for $count";
-		if ($count == 1) {print " family\n";}
-		else {print " different families\n";}
-	}
-	$count++;
-}
-print "\n$unclassified_families proteins could not be classified\n\n";
-
-print "*** The results were written in $family_classifications and $subfamily_classifications ***\n";
-print "*** done ***\n\n";
-
-exit;
-
-sub get_file_data {
-
-	my ($filename) = @_;
-
-	use strict;
-	use warnings;
-
-	my @filedata = ();
-
-	unless( open(GET_FILE_DATA, $filename)) {
-		print STDERR "Cannot open file \"$filename\"n\n";
-		exit;
-	}
-
-	@filedata = <GET_FILE_DATA>;
-
-	close GET_FILE_DATA;
-
-	return @filedata;
-}
-
--- a/test-data/output.2.tsv	Wed Feb 14 13:54:16 2024 +0000
+++ b/test-data/output.2.tsv	Thu Feb 22 10:07:53 2024 +0000
@@ -57,13 +57,14 @@
 HD-NDX	0
 HD-other	0
 HD-SAWADEE	0
-HD_BEL	0
 HD_DDT	0
-HD_KNOX1	0
-HD_KNOX2	0
 HD_PHD	0
 HD_PINTOX	0
 HD_PLINC	0
+HD_TALE	0
+HD_TALE_BEL	0
+HD_TALE_KNOX1	0
+HD_TALE_KNOX2	0
 HD_WOX	0
 HMG	0
 HSF	0
--- a/test-data/output.domtbl.tsv	Wed Feb 14 13:54:16 2024 +0000
+++ b/test-data/output.domtbl.tsv	Thu Feb 22 10:07:53 2024 +0000
@@ -20,9 +20,9 @@
 # Program:         hmmsearch
 # Version:         3.3.2 (Nov 2020)
 # Pipeline mode:   SEARCH
-# Query file:      /home/saskia/code/github/galaxyproject/tools-iuc/tools/tapscan/tapscan_domains_v12.txt
-# Target file:     /tmp/saskia/tmp4n93kzh1/files/b/7/b/dataset_b7b39fd6-41f1-440d-bfb7-da7a3a9f070e.dat
-# Option settings: hmmsearch --domtblout domtblout.txt --cut_ga /home/saskia/code/github/galaxyproject/tools-iuc/tools/tapscan/tapscan_domains_v12.txt /tmp/saskia/tmp4n93kzh1/files/b/7/b/dataset_b7b39fd6-41f1-440d-bfb7-da7a3a9f070e.dat
-# Current dir:     /tmp/saskia/tmp4n93kzh1/job_working_directory/000/4/working
-# Date:            Mon Nov 13 14:57:28 2023
+# Query file:      /home/saskia/code/github/bgruening/galaxytools/tools/tapscan/tapscan_domains_v13.txt.gz
+# Target file:     /tmp/saskia/tmpwta2cyq_/files/f/5/c/dataset_f5c238a3-8cc8-42f9-86b5-ebc2431be570.dat
+# Option settings: hmmsearch --domtblout domtblout.txt --cut_ga /home/saskia/code/github/bgruening/galaxytools/tools/tapscan/tapscan_domains_v13.txt.gz /tmp/saskia/tmpwta2cyq_/files/f/5/c/dataset_f5c238a3-8cc8-42f9-86b5-ebc2431be570.dat
+# Current dir:     /tmp/saskia/tmpwta2cyq_/job_working_directory/000/4/working
+# Date:            Thu Feb 22 10:10:40 2024
 # [ok]