Mercurial > repos > bgruening > tapscan
changeset 1:c4f865bd101a draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/tapscan commit af8605d266717ed3453bd2a0947fed79c7098fb3
author | bgruening |
---|---|
date | Thu, 22 Feb 2024 10:07:53 +0000 |
parents | 196795831b6a |
children | |
files | tapscan.xml tapscan_classify.pl tapscan_coverage_values_v10.txt tapscan_coverage_values_v11.txt tapscan_domains_v12.txt.gz tapscan_domains_v13.txt.gz tapscan_rules_v81.txt tapscan_rules_v82.txt tapscan_script_v74.pl test-data/output.2.tsv test-data/output.domtbl.tsv |
diffstat | 11 files changed, 1407 insertions(+), 1385 deletions(-) [+] |
line wrap: on
line diff
--- a/tapscan.xml Wed Feb 14 13:54:16 2024 +0000 +++ b/tapscan.xml Thu Feb 22 10:07:53 2024 +0000 @@ -1,4 +1,4 @@ -<tool id="tapscan_classify" name="TAPScan Classify" version="4.74+galaxy0" profile="23.0"> +<tool id="tapscan_classify" name="TAPScan Classify" version="4.76+galaxy0" profile="23.0"> <description>Detect Transcription Associated Proteins (TAPs)</description> <edam_topics> <edam_topic>topic_0121</edam_topic> @@ -9,28 +9,28 @@ <requirement type="package" version="4.8">sed</requirement> </requirements> <required_files> - <include type="literal" path="tapscan_script_v74.pl"/> - <include type="literal" path="tapscan_domains_v12.txt"/> - <include type="literal" path="tapscan_rules_v81.txt"/> - <include type="literal" path="tapscan_coverage_values_v10.txt"/> + <include type="literal" path="tapscan_classify.pl"/> + <include type="literal" path="tapscan_domains_v13.txt.gz"/> + <include type="literal" path="tapscan_rules_v82.txt"/> + <include type="literal" path="tapscan_coverage_values_v11.txt"/> </required_files> <command detect_errors="aggressive"><![CDATA[ hmmsearch --domtblout domtblout.txt --cut_ga - '${__tool_directory__}/tapscan_domains_v12.txt.gz' + '${__tool_directory__}/tapscan_domains_v13.txt.gz' '$protein_fasta_in' && -perl '${__tool_directory__}/tapscan_script_v74.pl' +perl '${__tool_directory__}/tapscan_classify.pl' domtblout.txt - '${__tool_directory__}/tapscan_rules_v81.txt' + '${__tool_directory__}/tapscan_rules_v82.txt' '$taps_detected' '$taps_family_counts' '$taps_detected_extra' - '${__tool_directory__}/tapscan_coverage_values_v10.txt' + '${__tool_directory__}/tapscan_coverage_values_v11.txt' &&
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tapscan_classify.pl Thu Feb 22 10:07:53 2024 +0000 @@ -0,0 +1,932 @@ +#!/usr/bin/perl +use strict; +use warnings; +use File::Basename; + +my $tapscan_version = "v4.76"; +print "Running TAPscan Classify version $tapscan_version \n\n"; + + +# Written by Gerrit Timmerhaus (gerrit.timmerhaus@biologie.uni-freiburg.de). +# Changes included by Kristian Ullrich, Per Wilhelmsson, Romy Petroll and Saskia Hiltemann. + +# Script to extract all detected domains out of a hmmsearch results file and classify the families of all used proteins based on these domains. +# The classification depends on a table which contains all known classification rules for the protein families of interest and on specific coverage values defined for every domain. +# The script provides three outputs, namely output.1, output.2 and output.3. The output files are tables in ";"-delimited format. +# The structure of output.1 is: "sequence ID ; TAP family ; number of classifications ; domains". +# Output.3 shares in principle the same structure as output.1, except that subfamilies are considered. ("sequence ID ; TAP family ; Subfamily ; number of classifications ; domains") +# The superior TAP family is specified first, followed by the subfamily. If a TAP family has no subfamily, the TAP family is specified first and then a "-". +# The structure of output.2 is: "TAP family";"number of detected proteins". +# More than one entry for a protein is possible because the classification rules may allow more than one classification. +# +# The script must be startet with the arguments <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamily classifications file> <"filter" if desired> + +if (!@ARGV or ($ARGV [0] eq "-h") or ($ARGV [0] eq "-help")) { + print "Usage: extract.and.classify.pl <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamily classifications file> <\"filter\" (if desired)>\n\n"; + exit; +} + +# hmmsearch_output: domtblout file +my $hmmsearch_output = $ARGV [0]; +# decision_table: rules file +my $decision_table = $ARGV [1]; +# family_classifications: output.1 +my $family_classifications = $ARGV [2]; +# family_statistics: output.2 +my $family_statistics = $ARGV [3]; +# subfamily_classifications: output.3 +my $subfamily_classifications = $ARGV [4]; +# domspec_cuts: coverage values file +my $domspec_cuts = $ARGV [5]; +# gene_model_filte: filter for ARATH and ORYSA +my $gene_model_filter = $ARGV [6]; + +# get basename for output files +my($basename, $dirs, $suffix) = fileparse($hmmsearch_output, qr/\.[^.]*/); + +if ($family_statistics eq "") { + print "Usage: extract.and.classify.pl <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamil classifications file> <\"filter\" (if desired)>\n\n"; + exit; +} + +if ($gene_model_filter and $gene_model_filter eq "filter") { + print "\nGene model filter is activated. It only works for TAIR (Arabidopsis) and TIGR (Rice) proteins up to now\n"; +} + +# Array where the $hmmsearch-output/domtblout file will be stored +my @output = (); +# Array with domain-specific coverage values +my @cuts = (); +# Array with rules +my @dec_table = (); +# Counter for the number of detected domains in the hmmsearch output file +my $entry_counter = 0; +# Containes the actual result for a query sequence +my $akt_entry = ""; +# Used to define query entry to ignore similar domains +my $whole_entry = ""; +# Includes the final entries after ignoring similar domains +my @results_of_extraction = (); +# Used to define query entry to ignore similar domains +my $extracted_domain = ""; +# Used to define query entry to ignore similar domains +my $present = ""; +# Used to define query entry to ignore similar domains +my $protein = ""; + +my $lek = ""; + +############################################ +### 1. Read in the hmmsearch output file ### +############################################ + +print "\n*** reading in $hmmsearch_output ***\n\n"; + +@output = get_file_data("$hmmsearch_output"); + +print "*** Parsing $hmmsearch_output ***\n\n"; + +# If wrong format exit the program, ninth row from the end +if ($output [-9] !~ /^# Program: hmmsearch.*/) { + print "wrong file format. $hmmsearch_output has to be a hmmsearch output file\n\n"; + exit; +} + +### 1.1 Trim input file (hmmsearch) to fit script. Erase lines starting with #. +@output = grep !/^#.*/, @output; + +# Read domain-specific coverage values file +# domname cut-off +# EIN3 0.5874125874 + +open(FILENAME,$domspec_cuts); +my %cuts = map { chomp; split /\t/ } <FILENAME>; + +### 1.2 Use first (prot name), third (domain name), eight (dom evalue), fourteenth (# overlapping) and fifteenth (# domain) column of data input(array element). +foreach my $output (@output) { + $output =~ /^(\S+)\s+\S+\s+\S+\s+(\S+)\s+\S+\s+(\S+)\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+\S+\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+.*/; + $lek = (($6-$5)+1)/$3; + $output = $1."\t".$2."\t".$4."\t".$lek; + } + +# Stucture: +# ARATHwo_AT2G03430.1 Ank 1.8e-08 0.93 +# ARATHwo_AT2G03430.1 Ank 7.8e-10 0.95 +# ARATHwo_AT2G03430.1 Ank 1.2e-08 0.93 +# ARATHwo_AT2G03430.1 Ank 1.2e-08 0.63 + +################################ +### 2. Modify the input data ### +################################ + +### 2.1 Run through length_cut_off loop (throws out entries below coverage values) +my @cutarray; +open(FH, '>', "${basename}_filtered_sequences.txt") or die $!; +print FH "Below are the sequences that were filtered out, along with the reason"; +foreach my $output (@output) { + $output =~ /^(\S+)\t+(\S+)\t(\S+)\t(\S+)/; + if ($4 > $cuts{$2}) { + push (@cutarray, $output); + } + else{ + print FH "$output --> did not meet coverage cutoff of $cuts{$2} (cov was $4)\n" + } +} +close(FH); + +# End up with (prot name) (domain name) (dom evalue) (overlapping) + +# Structure: +# ARATHwo_AT2G03430.1 Ank 1.8e-08 0.93 +# ARATHwo_AT2G03430.1 Ank 7.8e-10 0.95 +# ARATHwo_AT2G03430.1 Ank 1.2e-08 0.93 + + +### 2.2 Retain hit with the lowest evalue + +# String for the previous protein name +my $old_prot = ""; +# Coordinates for array filling +my $x = -1; +my $y = 1; +my $scoreeval; +my @newarray; +foreach my $cutarray (@cutarray) { + $cutarray =~ /^(\S+)\t+(\S+)\t(\S+)\t(\S+)/; + $akt_entry = $1.$2; + # If the protein contains more than one of the same domain + if ($old_prot eq $akt_entry) { + if ($3>$scoreeval) { + $y++; + $newarray[$x] = $1."\t".$2."\t".$3."\t".$y; + } + else { + $y++; + $newarray[$x] = $1."\t".$2."\t".$scoreeval."\t".$y; + } + } + # If the protein is new reset $j and push the entry in the next array + else { + $y = 1; + $x++; + $newarray[$x] = $1."\t".$2."\t".$3."\t".$y; + } + $old_prot = $akt_entry; + $scoreeval = $3 + +} +# Final structure: +# ARATHwo_AT2G03430.1 Ank 7.8e-10 3 + +### 2.3 Then sort @output primarily on prot name($1) and then secondarily on eval($3) to fit the "similar"-procedure. +@newarray = sort { (split '\t', $a)[0] cmp (split '\t', $b)[0] || (split '\t', $a)[2] <=> (split '\t', $b)[2] } @newarray; + +# Flags for merged domains +my $ARR_B_counter = 0; +my $bZIP_counter = 0; +my $ET_counter = 0; + +# Flags for similar domain omission +my $WD40domain = 0; # modified for FIE rule +my $FIEdomain = 0; # --------'''''-------- +my $MYBdomain = 0; +my $G2domain = 0; +my $YBdomain = 0; +my $YCdomain = 0; +my $PHDdomain = 0; +my $Alfindomain = 0; +my $Dr1domain = 0; +my $GATAdomain = 0; +my $Dofdomain = 0; +my $C1domain = 0; + +foreach my $line (@newarray) { + +# Reset the flags for domain omission + if ($line !~ /^$protein.*$/ ) { + $WD40domain = 0; + $FIEdomain = 0; + $MYBdomain = 0; + $G2domain = 0; + $YBdomain = 0; + $YCdomain = 0; + $PHDdomain = 0; + $Alfindomain = 0; + $Dr1domain = 0; + $GATAdomain = 0; + $Dofdomain = 0; + $C1domain = 0; + + } + + # Get the Query entry + $line =~ /^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/; + $protein = "$1"; + $extracted_domain = $2; + $present = "$4"; # FIX! number for many Myb + $whole_entry = $1.";".$2.";".$3.";".$4; + + + # Ignore FIE if it is not better scored than WD40 + if ($extracted_domain eq "WD40" and $FIEdomain == 0) {$WD40domain = 1;} + + # Ignore similar domains part 1 + + if ($extracted_domain eq "Myb_DNA-binding" and $G2domain == 0) {$MYBdomain = 1;} + if ($extracted_domain eq "G2-like_Domain" and $MYBdomain == 0) {$G2domain = 1;} + + if ($extracted_domain eq "PHD" and $Alfindomain == 0 and $C1domain == 0) {$PHDdomain = 1;} + if ($extracted_domain eq "Alfin-like" and $PHDdomain == 0 and $C1domain == 0) {$Alfindomain = 1;} + if ($extracted_domain eq "C1_2" and $Alfindomain == 0 and $PHDdomain == 0) {$C1domain = 1;} + + if ($extracted_domain eq "GATA" and $Dofdomain == 0) {$GATAdomain = 1;} + if ($extracted_domain eq "zf-Dof" and $GATAdomain == 0) {$Dofdomain = 1;} + + # Ignore similar domains part 2 + + if ($extracted_domain eq "FIE_clipped_for_HMM" and $WD40domain == 1) {next;} + if ($extracted_domain eq "Myb_DNA-binding" and $G2domain == 1) {next;} + if ($extracted_domain eq "G2-like_Domain" and $MYBdomain == 1) {next;} + + if ($extracted_domain eq "PHD" and ($Alfindomain == 1 or $C1domain == 1)) {next;} + if ($extracted_domain eq "Alfin-like" and ($PHDdomain == 1 or $C1domain ==1)) {next;} + if ($extracted_domain eq "C1_2" and ($PHDdomain == 1 or $Alfindomain == 1)) {next;} + + if ($extracted_domain eq "GATA" and $Dofdomain == 1) {next;} + if ($extracted_domain eq "zf-Dof" and $GATAdomain == 1) {next;} + + # Save the entry + push @results_of_extraction, $whole_entry; + $entry_counter++; +} +print "$entry_counter domain matches were found in $hmmsearch_output\n\n"; + +# Sort the entries: +my @sorted_results_of_extraction = @results_of_extraction; +print "*** Preparing hmmsearch results for classification ***\n\n"; + +# This array will be filled with the sorted proteins +my $array_of_arrays = []; + +# String for the previous protein name +my $old_entry = ""; + +# Coordinates for array filling +my $i = -1; +my $j = 0; + +@dec_table = get_file_data("$decision_table"); + +if ($dec_table [0] !~ /^[^;]+;[^;]+;[^;]/) { + print "wrong file format. $decision_table has to be in the format family;domain;type\n\n"; + exit; +} + +# Formate the entries +foreach my $domain_entry (@sorted_results_of_extraction) { + + $domain_entry =~ /^([^;]+);/; + $akt_entry = $1; + # If the protein has more than one domain push it behind the old entry + if ($old_entry eq $akt_entry) { + $j++; + $array_of_arrays->[$i][$j] = $domain_entry; + } + + # If the protein is new reset $j and push the entry in the next array + else { + $j = 0; + $i++; + $array_of_arrays->[$i][$j] = $domain_entry; + } + $old_entry = $akt_entry; + +} +$old_entry = ""; + +# Remember the number of entries to use them later +my $number_of_proteins = $i+1; +print "$number_of_proteins proteins were found in $hmmsearch_output\n"; +$number_of_proteins--; + +######################################### +### 3. Get the classification entries ### +######################################### + +# Make a list of all families in the classification rules file +my @liste_alle_familien = ('0_no_family_found'); + +my $ARR_B_in_list = 0; +my $bZIP_in_list = 0; +my $ET_in_list = 0; + +my $array_of_classifications = []; +$i = -1; +$j = 0; + +my @classifications_input = get_file_data("$decision_table"); + +# Sort the classifications +my @classifications = sort @classifications_input; + + +foreach my $classification (@classifications) { + $classification =~ /^([^;]+);/; + $akt_entry = $1; + + # If the family has more than one domain entry push it behind the old entry + if ($old_entry eq $akt_entry) { + + $j++; + $array_of_classifications->[$i][$j] = $classification; + } + + # If the family is new push it in the next array and reset $j + else { + $j = 0; + $i++; + $array_of_classifications->[$i][$j] = $classification; + # Add family to list of all families in the classification rules + push(@liste_alle_familien,$akt_entry); + + # Add hardcoded "OR" defined families to list of families if subfamily is present + # Do this just once for each family + if (($akt_entry eq "GARP_ARR-B_Myb") or ($akt_entry eq "GARP_ARR-B_G2")) { + if ($ARR_B_in_list == 0) { + push(@liste_alle_familien,"GARP_ARR-B"); + $ARR_B_in_list++; + }} + if (($akt_entry eq "bZIP1") or ($akt_entry eq "bZIP2") or ($akt_entry eq "bZIPAUREO") or ($akt_entry eq "bZIPCDD")) { + if ($bZIP_in_list == 0) { + push(@liste_alle_familien,"bZIP"); + $bZIP_in_list++; + }} + if (($akt_entry eq "HRT") or ($akt_entry eq "GIY_YIG")) { + if ($ET_in_list == 0) { + push(@liste_alle_familien,"ET"); + $ET_in_list++; + }} + } + $old_entry = $akt_entry; +} + +my $number_of_classifications = $i+1; +print "$number_of_classifications different families were found in $decision_table\n\n"; +$number_of_classifications--; + +print "*** begin classification ***\n\n"; + +# Create the output files (output.1 and output.3) +my $outputfile = "$family_classifications"; +my $subfamilyoutput = "$subfamily_classifications"; + +unless (open(FAMILY_CLASSIFICATIONS, ">$outputfile")) { + print "Cannot open file \"$outputfile\" to write to!!\n\n"; + exit; +} + +unless (open(SUBFAMILY_CLASSIFICATIONS, ">$subfamilyoutput")) { + print "Cannot open file \"$subfamilyoutput\" to write to!!\n\n"; + exit; +} + +print FAMILY_CLASSIFICATIONS "family classifications for $hmmsearch_output\n"; +print SUBFAMILY_CLASSIFICATIONS "family classifications for $hmmsearch_output\n"; + +# Counter for classified families +my $classified_families = 0; +# Counter for unclassified families +my $unclassified_families = 0; +# Counter for the different amount of entries in classification possibilities +my @entry_counter = []; +# Counter for the possible classifications for the current protein +my $possibilities = 0; +# For later famliy statistics +my @family_list = []; +# This array will be filled with the result. Later written in output.1 and output.3 +my @family_classifications_file = []; +my @subfamily_classifications_file = []; + +# One time for every protein in array of arrays +for (my $ii = 0; $ii <= $number_of_proteins; $ii++) { + my $jj = 0; + $possibilities = 0; + + # Flag to mark that any family was found for the current protein + my $member_found = 0; + + # In this string all the domains of one protein will be stored + # The ; signs are important to mark a single entry (for example to differ AP2 and TF_AP2) + my $domains = ";"; + + # Get the domains in all elements of the current protein array: + while ($array_of_arrays->[$ii][$jj]) { + + $array_of_arrays->[$ii][$jj] =~ /([^;]+);[^;]+;(\d+)$/; + # For discrimination between MYB and MYB-related. If more than 1 MYB domains was found replaced "Myb_DNA-binding" with "two_or_more_Myb_DNA-binding" + # Same for LIM + if ($1 eq "Myb_DNA-binding" and $2 eq 2) { + $domains .= "MYB-2R;Myb_DNA-binding;"; + } + elsif ($1 eq "Myb_DNA-binding" and $2 eq 3) { + $domains .= "MYB-3R;Myb_DNA-binding;"; + } + elsif ($1 eq "Myb_DNA-binding" and $2 eq 4) { + $domains .= "MYB-4R;Myb_DNA-binding;"; + } + elsif ($1 eq "LIM" and $2 > 1) { + $domains .= "two_or_more_LIM;LIM;"; + } + else { + $domains .= "$1;"; + } + $jj++; + } + +######################### +### 4. Classification ### +######################### + +# Get the name of the actual protein for the output after classification: +$array_of_arrays->[$ii][0] =~ /^([^;]+);/; +$akt_entry = $1; + +# Variables for double classification decisions: + +# Here the shoulds for the actual family will be stored +my $shoulds_act_family = ";"; +# Here the shoulds of the previous CLASSIFIED family will be stored +my $shoulds_prev_family = ""; + +# This loop will be made for every family entry in output.1 and output.3 +for ($i = 0;$i <= $number_of_classifications; $i++) { + $j = 0; + + # Flag to mark if the protein is a possible member of the current family + my $possible_member = 0; + + # Reset the domain entries for the current family + $shoulds_act_family = ";"; + + # This loop will be made for every domain classification entry for the current family + while ($array_of_classifications->[$i][$j]) { + + # For "should" entries + if ($array_of_classifications->[$i][$j] =~ /;should$/) { + + # Get the domain from the classification entry + $array_of_classifications->[$i][$j] =~ /^[^;]+;([^;]+);[^;]+/; + + my $domain_classification_entry = $1; + # Check if the protein might be a member of the current family. If yes check the next entry + if ($domains =~ /;$domain_classification_entry;/) { + $possible_member = 1; + $j++; + # All matched domains for the current family are stored here: + $shoulds_act_family .= "$domain_classification_entry;"; + next; + } + $possible_member = 0; + $j++; + last; + } + + # For "should not" entries + if ($array_of_classifications->[$i][$j] =~ /;should not$/) { + + # Get the domain from the classification entry + $array_of_classifications->[$i][$j] =~ /^[^;]+;([^;]+);[^;]+/; + + # Check if the protein has the domain. If yes go to the next family + my $domain_check = $1; + if ($domains =~ /;$domain_check;/) { + $possible_member = 0; + last; + } + # If not check the next entry + else { + $j++; + next; + } + } + + # This happens when there is no "should" or "should not" entry at the right position + print "error in classification entry $array_of_classifications->[$i][$j]\n" + } + + # If the check was successful print the found classification + if ($possible_member) { + + $array_of_classifications->[$i][0] =~ /^([^;]+);/; + my $currentFamily = $1; + $possibilities++; + # The double classified proteins solution: + if ($possibilities > 1) { + + # Fill an array with the current domains in $domains + my @domains_array = []; + my $number_of_domains = ($domains =~ tr/;/;/)-1; + my $tempdomains = $domains; + for (my $domain_counter = 0;$domain_counter != $number_of_domains;$domain_counter++) { + + $tempdomains =~ /^;([^;]+);/; + $domains_array[$domain_counter] = $1; + # Remove the inserted domain from domains + $tempdomains =~ s/^;[^;]+;/;/; + } + # Compare the two possible families (actual and previous family) + my $array_counter = 0; + while ($array_counter < $number_of_domains) { + if ($shoulds_prev_family =~ /$domains_array[$array_counter]/) { + if ($shoulds_act_family =~ /$domains_array[$array_counter]/) { + + # If actual domain is in previous and actual family go to next domain + $array_counter++; + next; + + } + # Protein was right classified in the previous classification. The actual entry will be ignored + + $possibilities--; + last; + } + + # If actual domain is in the actual family but not in the previous. The actual classification is right + if ($shoulds_act_family =~ /$domains_array[$array_counter]/) { + + pop @family_classifications_file; + pop @subfamily_classifications_file; + + # Merge "GARBP_ARR-B_Myb" and "GARP_ARR-B_G2" to "GARP_ARR-B" + if ( ($currentFamily eq "GARP_ARR-B_Myb") or ($currentFamily eq "GARP_ARR-B_G2")){ + + push @family_classifications_file, "$akt_entry;GARP_ARR-B;$possibilities$domains\n"; + push @subfamily_classifications_file, "$akt_entry;GARP_ARR-B;-;$possibilities$domains\n"; + if ($ARR_B_counter == 0) { + push @liste_alle_familien, 'GARP_ARR-B'; + $ARR_B_counter++; + } + } + elsif ( ($currentFamily eq "bZIP1") or ($currentFamily eq "bZIP2") or ($currentFamily eq "bZIPAUREO") or ($currentFamily eq "bZIPCDD")) { + + push @family_classifications_file, "$akt_entry;bZIP;$possibilities$domains\n"; + push @subfamily_classifications_file, "$akt_entry;bZIP;-;$possibilities$domains\n"; + if ($bZIP_counter == 0) { + push @liste_alle_familien, 'bZIP'; + $bZIP_counter++; + } + } + elsif ( ($currentFamily eq "HRT") or ($currentFamily eq "GIY_YIG")) { + push @subfamily_classifications_file, "$akt_entry;ET;-;$possibilities$domains\n"; + push @family_classifications_file, "$akt_entry;ET;$possibilities$domains\n"; + if ($ET_counter == 0) { + push @liste_alle_familien, 'ET'; + $ET_counter++; + } + } + else { + + push @family_classifications_file, "$akt_entry;$currentFamily;$possibilities$domains\n"; + push @subfamily_classifications_file, "$akt_entry;$currentFamily;-;$possibilities$domains\n"; + } + $possibilities--; + last; + + } + + $array_counter++; + } + + } + # Normal enty: if the family is classified for one family write it in the array + else { + + # Store the matched domains of the actual family for respectively later doubly classification decision + $shoulds_prev_family = $shoulds_act_family; + # Store the entry in the output array + # Merge "GARBP_ARR-B_Myb" and "GARP_ARR-B_G2" to "GARP_ARR-B" + # Add GARP_ARR-B to @liste_alle_familien ONCE + + if ( ($currentFamily eq "GARP_ARR-B_Myb") or ($currentFamily eq "GARP_ARR-B_G2")){ + push @family_classifications_file, "$akt_entry;GARP_ARR-B;$possibilities$domains\n"; + push @subfamily_classifications_file, "$akt_entry;GARP_ARR-B;-;$possibilities$domains\n"; + if ($ARR_B_counter == 0) { + push @liste_alle_familien, 'GARP_ARR-B'; + $ARR_B_counter++; + } + } + elsif (($currentFamily eq "bZIP1") or ($currentFamily eq "bZIP2") or ($currentFamily eq "bZIPAUREO") or ($currentFamily eq "bZIPCDD")){ + push @family_classifications_file, "$akt_entry;bZIP;$possibilities$domains\n"; + push @subfamily_classifications_file, "$akt_entry;bZIP;-;$possibilities$domains\n"; + if ($bZIP_counter == 0) { + push @liste_alle_familien, 'bZIP'; + $bZIP_counter++; + } + } + elsif (($currentFamily eq "HRT") or ($currentFamily eq "GIY_YIG")){ + push @family_classifications_file, "$akt_entry;ET;$possibilities$domains\n"; + push @subfamily_classifications_file, "$akt_entry;ET;-;$possibilities$domains\n"; + if ($ET_counter == 0) { + push @liste_alle_familien, 'ET'; + $ET_counter++; + } + } + else { + + push @family_classifications_file, "$akt_entry;$currentFamily;$possibilities$domains\n"; + push @subfamily_classifications_file, "$akt_entry;$currentFamily;-;$possibilities$domains\n"; + } + + $classified_families++; + } + + # Flag to mark that any family was found for the current protein + $member_found = 1; + } + + } + + if ($possibilities == 0) { + + # If no family was found for the current protein give this out + push @family_classifications_file, "$akt_entry;0_no_family_found;0$domains\n"; + push @subfamily_classifications_file, "$akt_entry;0_no_family_found;-;0$domains\n"; + $unclassified_families++; + } + + # Increment the number of possible members in the array at the right position + else { + + my $temp = $entry_counter[$possibilities]; + $temp++; + $entry_counter[$possibilities] = $temp; + } + +} +# Define for which TAP families also subfamilies may be present: + +foreach my $subfamily_classifications_file (@subfamily_classifications_file) { + + # If "C2H2" was assigned to a sequence, write "C2H2;C2H2" to output.3 as TAP family and subfamily, respectively. + if ($subfamily_classifications_file =~ /C2H2/ ) { + $subfamily_classifications_file =~ s/C2H2;-/C2H2;C2H2/ + } + # If "C2H2_IDD" was assigned to a sequence, write "C2H2;C2H2_IDD" to output.3 as TAP family and subfamily, respectively. + if ($subfamily_classifications_file =~ /C2H2_IDD/ ) { + $subfamily_classifications_file =~ s/C2H2_IDD;-/C2H2;C2H2_IDD/ + } + if ($subfamily_classifications_file =~ /bHLH/ ) { + $subfamily_classifications_file =~ s/bHLH;-/bHLH;bHLH/ + } + if ($subfamily_classifications_file =~ /bHLH_TCP/ ) { + $subfamily_classifications_file =~ s/bHLH_TCP;-/bHLH;bHLH_TCP/ + } + if ($subfamily_classifications_file =~ /NF-YA/ ) { + $subfamily_classifications_file =~ s/NF-YA;-/NFY;NF-YA/ + } + if ($subfamily_classifications_file =~ /NF-YB/ ) { + $subfamily_classifications_file =~ s/NF-YB;-/NFY;NF-YB/ + } + if ($subfamily_classifications_file =~ /NF-YC/ ) { + $subfamily_classifications_file =~ s/NF-YC;-/NFY;NF-YC/ + } + if ($subfamily_classifications_file =~ /C1HDZ/ ) { + $subfamily_classifications_file =~ s/C1HDZ;-/HDZ;C1HDZ/ + } + if ($subfamily_classifications_file =~ /C2HDZ/ ) { + $subfamily_classifications_file =~ s/C2HDZ;-/HDZ;C2HDZ/ + } + if ($subfamily_classifications_file =~ /C3HDZ/ ) { + $subfamily_classifications_file =~ s/C3HDZ;-/HDZ;C3HDZ/ + } + if ($subfamily_classifications_file =~ /C4HDZ/ ) { + $subfamily_classifications_file =~ s/C4HDZ;-/HDZ;C4HDZ/ + } + if ($subfamily_classifications_file =~ /MYST/ ) { + $subfamily_classifications_file =~ s/MYST;-/HAT;MYST/ + } + if ($subfamily_classifications_file =~ /CBP/ ) { + $subfamily_classifications_file =~ s/CBP;-/HAT;CBP/ + } + if ($subfamily_classifications_file =~ /TAFII250/ ) { + $subfamily_classifications_file =~ s/TAFII250;-/HAT;TAFII250/ + } + if ($subfamily_classifications_file =~ /GNAT/ ) { + $subfamily_classifications_file =~ s/GNAT;-/HAT;GNAT/ + } + if ($subfamily_classifications_file =~ /LOB1/ ) { + $subfamily_classifications_file =~ s/LOB1;-/LBD;LOB1/ + } + if ($subfamily_classifications_file =~ /LOB2/ ) { + $subfamily_classifications_file =~ s/LOB2;-/LBD;LOB2/ + } + if ($subfamily_classifications_file =~ /MYB-related/ ) { + $subfamily_classifications_file =~ s/MYB-related;-/MYB;MYB-related/ + } + if ($subfamily_classifications_file =~ /SWI\/SNF_SWI3/ ) { + $subfamily_classifications_file =~ s/SWI\/SNF_SWI3;-/MYB-related;SWI\/SNF_SWI3/ + } + if ($subfamily_classifications_file =~ /MYB-2R/ ) { + $subfamily_classifications_file =~ s/MYB-2R;-/MYB;MYB-2R/ + } + if ($subfamily_classifications_file =~ /MYB-3R/ ) { + $subfamily_classifications_file =~ s/MYB-3R;-/MYB;MYB-3R/ + } + if ($subfamily_classifications_file =~ /MYB-4R/ ) { + $subfamily_classifications_file =~ s/MYB-4R;-/MYB;MYB-4R/ + } + if ($subfamily_classifications_file =~ /RKD/ ) { + $subfamily_classifications_file =~ s/RKD;-/RWP-RK;RKD/ + } + if ($subfamily_classifications_file =~ /NLP/ ) { + $subfamily_classifications_file =~ s/NLP;-/RWP-RK;NLP/ + } + if ($subfamily_classifications_file =~ /AP2/ ) { + $subfamily_classifications_file =~ s/AP2;-/AP2;AP2/ + } + if ($subfamily_classifications_file =~ /CRF/ ) { + $subfamily_classifications_file =~ s/CRF;-/AP2;CRF/ + } +} + +### 4.1 Filter for gene models + +# Filter for gene models. All alternative splice variants will be removed from Arabidopsis and rice entries +shift @family_classifications_file; +shift @subfamily_classifications_file; +# Sort the array for filter and a sorted output +@family_classifications_file = sort @family_classifications_file; +@subfamily_classifications_file = sort @subfamily_classifications_file; +if ($gene_model_filter and $gene_model_filter eq "filter") { + print "*** filtering gene models: removing splice variants ***\n"; + + # temporary array for the filtered entries + my @filtered_entries = []; + my $models_removed_counter = 0; + + my $prev_entry = ""; + foreach my $entry_line (@family_classifications_file) { + if ($entry_line !~ /^([^.]+).\d+/) { + print "Bad format for filtering splice variants. Use \"filter\" only for TAIR (Aradopsis) or TIGR (Rice) files.\n\n"; + exit; + } + if ($1 eq $prev_entry) { + $models_removed_counter++; + next; + } + $prev_entry = $1; + push @filtered_entries, $entry_line; + } + print " -> $models_removed_counter gene models were removed\n\n"; + @family_classifications_file = @filtered_entries; + shift @family_classifications_file; +} + +# Print the results in the array to output.1 and output.3 and push the names of the TAP families and Subfamilies into $family_list to create output.2 +foreach my $fcf_line (@family_classifications_file) { + $fcf_line =~ /^[^;]+;([^;]+)/; + #push @family_list, "$1"; + print FAMILY_CLASSIFICATIONS "$fcf_line"; +} +close (FAMILY_CLASSIFICATIONS); + +foreach my $fcf_line (@subfamily_classifications_file) { + $fcf_line =~ /^[^;]+;([^;]+);([^;]+)/; + if ($2 eq "-") { + push @family_list, "$1"; + } + else { + push @family_list, "$2"; + } + print SUBFAMILY_CLASSIFICATIONS "$fcf_line"; +} + +close (SUBFAMILY_CLASSIFICATIONS); + +print "*** calculating the family statistics and write it in $family_statistics ***\n\n"; + +################################## +### 5. Create the output files ### +################################## + +my $statistics_outputfile = "$family_statistics"; + +unless (open(FAMILY_STATISTICS, ">$statistics_outputfile")) { + print "Cannot open file \"$statistics_outputfile\" to write to!!\n\n"; + exit; +} + +# Count the family entries +my @output_family_statistics = (); +my @gefundene_familien = (); +my $family_counter = 1; + +shift @family_list; +@family_list = sort @family_list; + + +my $old_family = ""; +push @family_list, 'BAD FIX'; # Makes to loop go through every fam and stops at non fam(BAD FIX). + +foreach my $line (@family_list) { + if ($line eq $old_family) { + $family_counter++; + } + elsif ($old_family ne "") { + push (@output_family_statistics,"$old_family;$family_counter\n"); + # Add all found families to a list of found families + push (@gefundene_familien,"$old_family"); + $family_counter=1; + } + $old_family = $line; +} + +my %hash = (); + +# Put all families from the classifictaion ruled and all found families in a hash +foreach my $element (@gefundene_familien,@liste_alle_familien) {$hash{$element}++;} + +# Remove merged families +delete $hash{'GARP_ARR-B_Myb'}; +delete $hash{'GARP_ARR-B_G2'}; +delete $hash{'bZIP1'}; +delete $hash{'bZIP2'}; +delete $hash{'bZIPAUREO'}; +delete $hash{'bZIPCDD'}; +delete $hash{'HRT'}; +delete $hash{'GIY_YIG'}; + +# Add all not found families from the classification rules to @output_family_statistics +# With zero as number of families found + +foreach my $element (keys %hash) { + if (($hash{$element} == 1) and ( $element eq "0_no_family_found")) { + push (@output_family_statistics,"$element;$unclassified_families\n"); + } + if (($hash{$element} == 1) and ($element ne "0_no_family_found")) { + push (@output_family_statistics,"$element;0\n"); + } +} + +# Sort @output_family_statistics caseinsensitive-alphabetically +my @sortierte_statistik = sort {lc $a cmp lc $b} @output_family_statistics; + +# Print headline to @sortierte_statistik +unshift (@sortierte_statistik,"family statistics for $hmmsearch_output\n"); + +print FAMILY_STATISTICS @sortierte_statistik; + +# Print FAMILY_STATISTICS "$old_family;$family_counter\n"; + +close (FAMILY_STATISTICS); + +################################################# +### 6. Give out some statistical informations ### +################################################# + +$entry_counter [0] = 0; +my $sum = 0; +foreach my $entry (@entry_counter) { + $sum += $entry; +} +print "$classified_families classifications were found for $sum proteins.\n"; +print "This classifications are divided in:\n"; +my $count = 0; +foreach my $element (@entry_counter) { + if ($count != 0) { + print "$element proteins were classified for $count"; + if ($count == 1) {print " family\n";} + else {print " different families\n";} + } + $count++; +} +print "\n$unclassified_families proteins could not be classified\n\n"; + +print "*** The results were written in $family_classifications and $subfamily_classifications ***\n"; +print "*** done ***\n\n"; + + +exit; + +sub get_file_data { + + my ($filename) = @_; + + use strict; + use warnings; + + my @filedata = (); + + unless( open(GET_FILE_DATA, $filename)) { + print STDERR "Cannot open file \"$filename\"n\n"; + exit; + } + + @filedata = <GET_FILE_DATA>; + + close GET_FILE_DATA; + + return @filedata; +} + +
--- a/tapscan_coverage_values_v10.txt Wed Feb 14 13:54:16 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,154 +0,0 @@ -Acetyltransf_1 0.0931034483 -AP2 0.3214285714 -ARID 0.0516393443 -AUX_IAA 0.1077981651 -Auxin_resp 0.4308510638 -B3 0.196875 -BES1_N 0.3888888889 -BSD 0.485915493 -BTB 0.3105095541 -bZIP_1 0.64453125 -bZIP_2 0.5948275862 -C1_2 0.1612903226 -CAF1C_H4-bd 0.1764705882 -CBFB_NFYA 0.2019230769 -CCT 0.4952830189 -CG-1 0.5568181818 -CSD 0.5878378378 -DDT 0.2678571429 -DEAD 0.1115591398 -dsrm 0.2330097087 -DUF260 0.3638059701 -DUF296 0.155075188 -DUF547 0.1181672026 -DUF573 0.3515625 -DUF632 0.3302603037 -DUF702 0.2314126394 -E2F_TDP 0.2078313253 -EIN3 0.5874125874 -FHA 0.1777456647 -FLO_LFY 0.4569138277 -FYRC 0.3928571429 -FYRN 0.46875 -GAGA_bind 0.2265774379 -GATA 0.5406976744 -GRAS 0.3990825688 -Helicase_C 0.118852459 -HLH 0.3191489362 -HMG_box 0.6866197183 -Homeobox 0.5485074627 -HSF_DNA-bind 0.071641791 -IQ 0.6428571429 -JmjC 0.5058139535 -JmjN 0.4261363636 -K-box 0.438 -KNOX1 0.578125 -KNOX2 0.6346153846 -LIM 0.4726027397 -MBF1 0.3049450549 -Med26 0.1939655172 -Med31 0.451048951 -Med6 0.1577868852 -Med7 0.1818181818 -MEKHLA 0.4357541899 -mTERF 0.4850543478 -Myb_DNA-binding 0.3636363636 -NAM 0.0142857143 -O-FucT 0.0914866582 -Ovate 0.3137755102 -PAH 0.1194267516 -PAZ 0.1564569536 -PC4 0.3848684211 -PHD 0.37 -Piwi 0.4082278481 -PLATZ 0.30625 -PP2C 0.4110962567 -QLQ 0.61875 -RB_B 0.1663987138 -Rcd1 0.4131355932 -Response_reg 0.5017241379 -RFX_DNA_binding 0.5257731959 -RHD_DNA_bind 0.4581447964 -Ribonuclease_3 0.0935013263 -RRN3 0.2342427093 -Runt 0.5714285714 -RWP-RK 0.4542253521 -S1FA 0.7386363636 -SBP 0.4274193548 -SET 0.0392857143 -SH2 0.4197247706 -Sigma70_r2 0.1675531915 -Sigma70_r3 0.6346153846 -Sigma70_r4 0.6388888889 -SIR2 0.45703125 -SNF2_N 0.1150895141 -SRF-TF 0.406779661 -SSXT 0.5856164384 -START 0.5067567568 -STAT_bind 0.3781869688 -SWIB 0.3267326733 -SWIRM 0.2532467532 -TANGO2 0.125 -TCP 0.219665272 -TCR 0.4943181818 -TEA 0.4078282828 -TF_AP-2 0.5660377358 -Tfb2 0.1757668712 -tify 0.5955882353 -Tub 0.0872576177 -VEFS-Box 0.4918831169 -WD40 0.0563909774 -WHIM1 0.5263157895 -Whirly 0.5648148148 -WRC 0.6223404255 -WRKY 0.2196969697 -WSD 0.1768867925 -YABBY 0.2804621849 -zf-AN1 0.4027777778 -zf-B_box 0.2746478873 -zf-C2H2 0.5080645161 -zf-C5HC2 0.253125 -zf-CCCH 0.5689655172 -zf-Dof 0.5955882353 -ZF-HD_dimer 0.5294117647 -zf-MIZ 0.6805555556 -zf-TAZ 0.1566455696 -zf-ZPR1 0.1608910891 -Zn_clus 0.5480769231 -Alfin-like 0.75 -BEL 0.75 -DNC 0.75 -FIE_clipped_for_HMM 0.75 -G2-like_Domain 0.75 -HRT 0.75 -KNOXC 0.75 -LUFS_Domain 0.75 -NF-YB 0.75 -NF-YC 0.75 -NOZZLE 0.75 -PINTOX 0.75 -STER_AP 0.75 -trihelix 0.75 -ULT_Domain 0.75 -VARL 0.75 -VOZ_Domain 0.75 -WOX_HD 0.75 -CXC 0.75 -bZIP_AUREO 0.75 -bZIP_CDD 0.50 -ALOG 0.75 -C2H2-IDD 0.75 -zf-MYST 0.75 -CBP 0.75 -DUF3591 0.75 -LOB2 0.75 -zz-ADA2 0.75 -NLP 0.75 -CRF 0.75 -GIY_YIG 0.75 -ZPR 0.75 -LD 0.75 -NDX 0.75 -SAWADEE 0.75 -C1HDZ 0.75 -C2HDZ 0.75
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tapscan_coverage_values_v11.txt Thu Feb 22 10:07:53 2024 +0000 @@ -0,0 +1,155 @@ +Acetyltransf_1 0.0931034483 +AP2 0.3214285714 +ARID 0.0516393443 +AUX_IAA 0.1077981651 +Auxin_resp 0.4308510638 +B3 0.196875 +BES1_N 0.3888888889 +BSD 0.485915493 +BTB 0.3105095541 +bZIP_1 0.64453125 +bZIP_2 0.5948275862 +C1_2 0.1612903226 +CAF1C_H4-bd 0.1764705882 +CBFB_NFYA 0.2019230769 +CCT 0.4952830189 +CG-1 0.5568181818 +CSD 0.5878378378 +DDT 0.2678571429 +DEAD 0.1115591398 +dsrm 0.2330097087 +DUF260 0.3638059701 +DUF296 0.155075188 +DUF547 0.1181672026 +DUF573 0.3515625 +DUF632 0.3302603037 +DUF702 0.2314126394 +E2F_TDP 0.2078313253 +EIN3 0.5874125874 +FHA 0.1777456647 +FLO_LFY 0.4569138277 +FYRC 0.3928571429 +FYRN 0.46875 +GAGA_bind 0.2265774379 +GATA 0.5406976744 +GRAS 0.3990825688 +Helicase_C 0.118852459 +HLH 0.3191489362 +HMG_box 0.6866197183 +Homeobox 0.5485074627 +HSF_DNA-bind 0.071641791 +IQ 0.6428571429 +JmjC 0.5058139535 +JmjN 0.4261363636 +K-box 0.438 +KNOX1 0.578125 +KNOX2 0.6346153846 +LIM 0.4726027397 +MBF1 0.3049450549 +Med26 0.1939655172 +Med31 0.451048951 +Med6 0.1577868852 +Med7 0.1818181818 +MEKHLA 0.4357541899 +mTERF 0.4850543478 +Myb_DNA-binding 0.3636363636 +NAM 0.0142857143 +O-FucT 0.0914866582 +Ovate 0.3137755102 +PAH 0.1194267516 +PAZ 0.1564569536 +PC4 0.3848684211 +PHD 0.37 +Piwi 0.4082278481 +PLATZ 0.30625 +PP2C 0.4110962567 +QLQ 0.61875 +RB_B 0.1663987138 +Rcd1 0.4131355932 +Response_reg 0.5017241379 +RFX_DNA_binding 0.5257731959 +RHD_DNA_bind 0.4581447964 +Ribonuclease_3 0.0935013263 +RRN3 0.2342427093 +Runt 0.5714285714 +RWP-RK 0.4542253521 +S1FA 0.7386363636 +SBP 0.4274193548 +SET 0.0392857143 +SH2 0.4197247706 +Sigma70_r2 0.1675531915 +Sigma70_r3 0.6346153846 +Sigma70_r4 0.6388888889 +SIR2 0.45703125 +SNF2_N 0.1150895141 +SRF-TF 0.406779661 +SSXT 0.5856164384 +START 0.5067567568 +STAT_bind 0.3781869688 +SWIB 0.3267326733 +SWIRM 0.2532467532 +TANGO2 0.125 +TCP 0.219665272 +TCR 0.4943181818 +TEA 0.4078282828 +TF_AP-2 0.5660377358 +Tfb2 0.1757668712 +tify 0.5955882353 +Tub 0.0872576177 +VEFS-Box 0.4918831169 +WD40 0.0563909774 +WHIM1 0.5263157895 +Whirly 0.5648148148 +WRC 0.6223404255 +WRKY 0.2196969697 +WSD 0.1768867925 +YABBY 0.2804621849 +zf-AN1 0.4027777778 +zf-B_box 0.2746478873 +zf-C2H2 0.5080645161 +zf-C5HC2 0.253125 +zf-CCCH 0.5689655172 +zf-Dof 0.5955882353 +ZF-HD_dimer 0.5294117647 +zf-MIZ 0.6805555556 +zf-TAZ 0.1566455696 +zf-ZPR1 0.1608910891 +Zn_clus 0.5480769231 +Alfin-like 0.75 +BEL 0.75 +DNC 0.75 +FIE_clipped_for_HMM 0.75 +G2-like_Domain 0.75 +HRT 0.75 +KNOXC 0.75 +LUFS_Domain 0.75 +NF-YB 0.75 +NF-YC 0.75 +NOZZLE 0.75 +PINTOX 0.75 +STER_AP 0.75 +trihelix 0.75 +ULT_Domain 0.75 +VARL 0.75 +VOZ_Domain 0.75 +WOX_HD 0.75 +CXC 0.75 +bZIP_AUREO 0.75 +bZIP_CDD 0.50 +ALOG 0.75 +C2H2-IDD 0.75 +zf-MYST 0.75 +CBP 0.75 +DUF3591 0.75 +LOB2 0.75 +zz-ADA2 0.75 +NLP 0.75 +CRF 0.75 +GIY_YIG 0.75 +ZPR 0.75 +LD 0.75 +NDX 0.75 +SAWADEE 0.75 +C1HDZ 0.75 +C2HDZ 0.75 +Homeobox_KN 0.75
--- a/tapscan_rules_v81.txt Wed Feb 14 13:54:16 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,298 +0,0 @@ -ABI3/VP1;AP2;should not -ABI3/VP1;Auxin_resp;should not -ABI3/VP1;B3;should -ABI3/VP1;WRKY;should not -Alfin-like;Alfin-like;should -Alfin-like;Homeobox;should not -Alfin-like;zf-TAZ;should not -Alfin-like;PHD;should not -AP2;AP2;should -AP2;CRF;should not -ARF;Auxin_resp;should -Argonaute;Piwi;should -Argonaute;PAZ;should -ARID;ARID;should -Aux/IAA;AUX_IAA;should -Aux/IAA;Auxin_resp;should not -Aux/IAA;B3;should not -BBR/BPC;GAGA_bind;should -BES1;BES1_N;should -bHLH;HLH;should -bHLH;TCP;should not -bHLH_TCP;TCP;should -bHSH;TF_AP-2;should -BSD domain containing;BSD;should -bZIP1;bZIP_1;should -bZIP1;HLH;should not -bZIP1;Homeobox;should not -bZIP2;bZIP_2;should -bZIP2;HLH;should not -bZIP2;Homeobox;should not -bZIPAUREO;bZIP_AUREO;should -bZIPAUREO;HLH;should not -bZIPAUREO;Homeobox;should not -bZIPCDD;bZIP_CDD;should -bZIPCDD;HLH;should not -bZIPCDD;Homeobox;should not -C2C2_CO-like;CCT;should -C2C2_CO-like;GATA;should not -C2C2_CO-like;tify;should not -C2C2_CO-like;PLATZ;should not -C2C2_CO-like;zf-B_box;should -C2C2_Dof;zf-Dof;should -C2C2_Dof;GATA;should not -C2C2_GATA;GATA;should -C2C2_GATA;tify;should not -C2C2_GATA;zf-Dof;should not -C2C2_YABBY;YABBY;should -C2H2;zf-C2H2;should -C2H2;zf-MIZ;should not -C3H;AP2;should not -C3H;SRF-TF;should not -C3H;MYB-2R;should not -C3H;MYB-3R;should not -C3H;MYB-4R;should not -C3H;zf-C2H2;should not -C3H;zf-CCCH;should -CAMTA;CG-1;should -CAMTA;IQ;should -NF-YA;bZIP_1;should not -NF-YA;bZIP_2;should not -NF-YA;CBFB_NFYA;should -NF-YB;NF-YB;should -NF-YB;NF-YC;should not -NF-YC;NF-YB;should not -NF-YC;NF-YC;should -NF-YC;HMG_box;should not -Coactivator p15;PC4;should -CPP;TCR;should -CSD;CSD;should -CudA;STAT_bind;should -CudA;SH2;should -DBP;DNC;should -DBP;PP2C;should -DDT;DDT;should -DDT;Homeobox;should not -DDT;Alfin-like;should not -Dicer;Piwi;should not -Dicer;DEAD;should -Dicer;Helicase_C;should -Dicer;Ribonuclease_3;should -Dicer;dsrm;should -DUF246 domain containing/O-FucT;O-FucT;should -DUF296 domain containing;DUF296;should -DUF547 domain containing;DUF547;should -DUF632 domain containing;DUF632;should -DUF833 domain containing/TANGO2;TANGO2;should -E2F/DP;E2F_TDP;should -EIL;EIN3;should -FHA;FHA;should -GARP_ARR-B_G2;CCT;should not -GARP_ARR-B_G2;G2-like_Domain;should -GARP_ARR-B_G2;Response_reg;should -GARP_ARR-B_Myb;CCT;should not -GARP_ARR-B_Myb;Response_reg;should -GARP_ARR-B_Myb;Myb_DNA-binding;should -GARP_G2-like;G2-like_Domain;should -GARP_G2-like;Response_reg;should not -GARP_G2-like;Myb_DNA-binding;should not -GeBP;DUF573;should -GIF;SSXT;should -GNAT;Acetyltransf_1;should -GNAT;PHD;should not -GRAS;GRAS;should -GRF;QLQ;should -GRF;WRC;should -C3HDZ;Homeobox;should -C3HDZ;START;should -C3HDZ;MEKHLA;should -C4HDZ;Homeobox;should -C4HDZ;START;should -C4HDZ;MEKHLA;should not -HD_PLINC;ZF-HD_dimer;should -HD_WOX;WOX_HD;should -HD_DDT;Homeobox;should -HD_DDT;DDT;should -HD_DDT;WHIM1;should -HD_DDT;WSD;should -HD_PHD;PHD;should -HD_PHD;Homeobox;should -HD_PINTOX;Homeobox;should -HD_PINTOX;PINTOX;should -HD_BEL;Homeobox;should -HD_BEL;BEL;should -HD_KNOX1;Homeobox;should -HD_KNOX1;KNOX1;should -HD_KNOX1;KNOX2;should -HD_KNOX1;KNOXC;should not -HD_KNOX2;Homeobox;should -HD_KNOX2;KNOX1;should -HD_KNOX2;KNOX2;should -HD_KNOX2;KNOXC;should -HD-other;EIN3;should not -HD-other;Homeobox;should -HD-other;bZIP_1;should not -HD-other;WOX_HD;should not -HD-other;PINTOX;should not -HD-other;PHD;should not -HD-other;BEL;should not -HMG;ARID;should not -HMG;HMG_box;should -HMG;YABBY;should not -HRT;HRT;should -HSF;HSF_DNA-bind;should -IWS1;Med26;should -Jumonji_PKDM7;JmjC;should -Jumonji_PKDM7;JmjN;should -Jumonji_PKDM7;zf-C5HC2;should -Jumonji_PKDM7;FYRN;should -Jumonji_PKDM7;FYRC;should -Jumonji_Other;JmjC;should -LFY;FLO_LFY;should -LIM;two_or_more_LIM;should -LUG;LUFS_Domain;should -MADS;SRF-TF;should -MADS;K-box;should not -MADS_MIKC;SRF-TF;should -MADS_MIKC;K-box;should -MBF1;MBF1;should -Med6;Med6;should -Med7;Med7;should -mTERF;mTERF;should -MYB-2R;G2-like_Domain;should not -MYB-2R;Response_reg;should not -MYB-2R;trihelix;should not -MYB-2R;MYB-2R;should -MYB-3R;G2-like_Domain;should not -MYB-3R;Response_reg;should not -MYB-3R;trihelix;should not -MYB-3R;MYB-3R;should -MYB-4R;G2-like_Domain;should not -MYB-4R;Response_reg;should not -MYB-4R;trihelix;should not -MYB-4R;MYB-4R;should -MYB-related;ARID;should not -MYB-related;G2-like_Domain;should not -MYB-related;Myb_DNA-binding;should -MYB-related;Response_reg;should not -MYB-related;trihelix;should not -MYB-related;MYB-2R;should not -MYB-related;MYB-3R;should not -MYB-related;MYB-4R;should not -NAC;NAM;should -NZZ;NOZZLE;should -OFP;Ovate;should -PcG_EZ;CXC;should -PcG_EZ;SET;should -PcG_FIE;FIE_clipped_for_HMM;should -PcG_FIE;WD40;should -PcG_VEFS;VEFS-Box;should -PcG_VEFS;zf-C2H2;should not -PcG_MSI;WD40;should -PcG_MSI;CAF1C_H4-bd;should -PcG_MSI;FIE_clipped_for_HMM;should not -PHD;Myb_DNA-binding;should not -PHD;Alfin-like;should not -PHD;ARID;should not -PHD;DDT;should not -PHD;Homeobox;should not -PHD;JmjC;should not -PHD;JmjN;should not -PHD;PHD;should -PHD;SWIB;should not -PHD;zf-TAZ;should not -PHD;zf-MIZ;should not -PHD;zf-CCCH;should not -PHD;HMG_box;should not -PLATZ;PLATZ;should -Pseudo ARR-B;CCT;should -Pseudo ARR-B;Response_reg;should -Pseudo ARR-B;tify;should not -RB;RB_B;should -Rcd1-like;Rcd1;should -Rel;RHD_DNA_bind;should -RF-X;RFX_DNA_binding;should -RRN3;RRN3;should -Runt;Runt;should -S1Fa-like;S1FA;should -SAP;STER_AP;should -SBP;SBP;should -SET;zf-C2H2;should not -SET;TCR;should not -SET;CXC;should not -SET;PHD;should not -SET;Myb_DNA-binding;should not -SET;SET;should -Sigma70-like;Sigma70_r2;should -Sigma70-like;Sigma70_r3;should -Sigma70-like;Sigma70_r4;should -Sin3;PAH;should -Sin3;WRKY;should not -Sir2;SIR2;should -SOH1;Med31;should -SRS;DUF702;should -SWI/SNF_BAF60b;SWIB;should -SWI/SNF_SNF2;AP2;should not -SWI/SNF_SNF2;PHD;should not -SWI/SNF_SNF2;SNF2_N;should -SWI/SNF_SNF2;zf-CCCH;should not -SWI/SNF_SNF2;Myb_DNA-binding;should not -SWI/SNF_SNF2;HMG_box;should not -SWI/SNF_SWI3;SWIRM;should -SWI/SNF_SWI3;Myb_DNA-binding;should -TEA;TEA;should -TFb2;Tfb2;should -tify;tify;should -TRAF;BTB;should -TRAF;zf-TAZ;should not -Trihelix;trihelix;should -TUB;Tub;should -ULT;ULT_Domain;should -VARL;VARL;should -VOZ;VOZ_Domain;should -Whirly;Whirly;should -WRKY;WRKY;should -Zinc finger, AN1 and A20 type;zf-AN1;should -Zinc finger, AN1 and A20 type;zf-C2H2;should not -Zinc finger, MIZ type;zf-MIZ;should -Zinc finger, MIZ type;zf-C2H2;should not -Zinc finger, ZPR1;zf-ZPR1;should -Zn_clus;Zn_clus;should -ALOG;ALOG;should -C2H2;C2H2-IDD;should not -C2H2_IDD;C2H2-IDD;should -C2H2_IDD;zf-C2H2;should -MYST;zf-MYST;should -CBP;CBP;should -CBP;zf-TAZ;should -CBP;BTB;should not -TAFII250;DUF3591;should -LOB1;bZIP_1;should not -LOB1;bZIP_2;should not -LOB1;DUF260;should -LOB1;HLH;should not -LOB1;Homeobox;should not -LOB2;LOB2;should -LDL/FLD;SWIRM;should -LDL/FLD;Myb_DNA-binding;should not -ADA2;zz-ADA2;should -ADA2;Myb_DNA-binding;should -RKD;RWP-RK;should -RKD;NLP;should not -NLP;RWP-RK;should -NLP;NLP;should -CRF;CRF;should -CRF;AP2;should -GIY_YIG;GIY_YIG;should -ZPR;ZPR;should -HD-LD;LD;should -HD-NDX;NDX;should -HD-SAWADEE;SAWADEE;should -C1HDZ;C1HDZ;should -C1HDZ;Homeobox;should -C1HDZ;START;should not -C1HDZ;MEKHLA;should not -C2HDZ;C2HDZ;should -C2HDZ;Homeobox;should -C2HDZ;START;should not -C2HDZ;MEKHLA;should not
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tapscan_rules_v82.txt Thu Feb 22 10:07:53 2024 +0000 @@ -0,0 +1,302 @@ +ABI3/VP1;AP2;should not +ABI3/VP1;Auxin_resp;should not +ABI3/VP1;B3;should +ABI3/VP1;WRKY;should not +Alfin-like;Alfin-like;should +Alfin-like;Homeobox;should not +Alfin-like;zf-TAZ;should not +Alfin-like;PHD;should not +AP2;AP2;should +AP2;CRF;should not +ARF;Auxin_resp;should +Argonaute;Piwi;should +Argonaute;PAZ;should +ARID;ARID;should +Aux/IAA;AUX_IAA;should +Aux/IAA;Auxin_resp;should not +Aux/IAA;B3;should not +BBR/BPC;GAGA_bind;should +BES1;BES1_N;should +bHLH;HLH;should +bHLH;TCP;should not +bHLH_TCP;TCP;should +bHSH;TF_AP-2;should +BSD domain containing;BSD;should +bZIP1;bZIP_1;should +bZIP1;HLH;should not +bZIP1;Homeobox;should not +bZIP2;bZIP_2;should +bZIP2;HLH;should not +bZIP2;Homeobox;should not +bZIPAUREO;bZIP_AUREO;should +bZIPAUREO;HLH;should not +bZIPAUREO;Homeobox;should not +bZIPCDD;bZIP_CDD;should +bZIPCDD;HLH;should not +bZIPCDD;Homeobox;should not +C2C2_CO-like;CCT;should +C2C2_CO-like;GATA;should not +C2C2_CO-like;tify;should not +C2C2_CO-like;PLATZ;should not +C2C2_CO-like;zf-B_box;should +C2C2_Dof;zf-Dof;should +C2C2_Dof;GATA;should not +C2C2_GATA;GATA;should +C2C2_GATA;tify;should not +C2C2_GATA;zf-Dof;should not +C2C2_YABBY;YABBY;should +C2H2;zf-C2H2;should +C2H2;zf-MIZ;should not +C3H;AP2;should not +C3H;SRF-TF;should not +C3H;MYB-2R;should not +C3H;MYB-3R;should not +C3H;MYB-4R;should not +C3H;zf-C2H2;should not +C3H;zf-CCCH;should +CAMTA;CG-1;should +CAMTA;IQ;should +NF-YA;bZIP_1;should not +NF-YA;bZIP_2;should not +NF-YA;CBFB_NFYA;should +NF-YB;NF-YB;should +NF-YB;NF-YC;should not +NF-YC;NF-YB;should not +NF-YC;NF-YC;should +NF-YC;HMG_box;should not +Coactivator p15;PC4;should +CPP;TCR;should +CSD;CSD;should +CudA;STAT_bind;should +CudA;SH2;should +DBP;DNC;should +DBP;PP2C;should +DDT;DDT;should +DDT;Homeobox;should not +DDT;Alfin-like;should not +Dicer;Piwi;should not +Dicer;DEAD;should +Dicer;Helicase_C;should +Dicer;Ribonuclease_3;should +Dicer;dsrm;should +DUF246 domain containing/O-FucT;O-FucT;should +DUF296 domain containing;DUF296;should +DUF547 domain containing;DUF547;should +DUF632 domain containing;DUF632;should +DUF833 domain containing/TANGO2;TANGO2;should +E2F/DP;E2F_TDP;should +EIL;EIN3;should +FHA;FHA;should +GARP_ARR-B_G2;CCT;should not +GARP_ARR-B_G2;G2-like_Domain;should +GARP_ARR-B_G2;Response_reg;should +GARP_ARR-B_Myb;CCT;should not +GARP_ARR-B_Myb;Response_reg;should +GARP_ARR-B_Myb;Myb_DNA-binding;should +GARP_G2-like;G2-like_Domain;should +GARP_G2-like;Response_reg;should not +GARP_G2-like;Myb_DNA-binding;should not +GeBP;DUF573;should +GIF;SSXT;should +GNAT;Acetyltransf_1;should +GNAT;PHD;should not +GRAS;GRAS;should +GRF;QLQ;should +GRF;WRC;should +C3HDZ;Homeobox;should +C3HDZ;START;should +C3HDZ;MEKHLA;should +C4HDZ;Homeobox;should +C4HDZ;START;should +C4HDZ;MEKHLA;should not +HD_PLINC;ZF-HD_dimer;should +HD_WOX;WOX_HD;should +HD_DDT;Homeobox;should +HD_DDT;DDT;should +HD_DDT;WHIM1;should +HD_DDT;WSD;should +HD_PHD;PHD;should +HD_PHD;Homeobox;should +HD_PINTOX;Homeobox;should +HD_PINTOX;PINTOX;should +HD_TALE_BEL;Homeobox_KN;should +HD_TALE_BEL;BEL;should +HD_TALE_KNOX1;Homeobox_KN;should +HD_TALE_KNOX1;KNOX1;should +HD_TALE_KNOX1;KNOX2;should +HD_TALE_KNOX1;KNOXC;should not +HD_TALE_KNOX2;Homeobox_KN;should +HD_TALE_KNOX2;KNOX1;should +HD_TALE_KNOX2;KNOX2;should +HD_TALE_KNOX2;KNOXC;should +HD-other;EIN3;should not +HD-other;Homeobox;should +HD-other;bZIP_1;should not +HD-other;WOX_HD;should not +HD-other;PINTOX;should not +HD-other;PHD;should not +HD-other;BEL;should not +HD_TALE;Homeobox_KN;should +HD_TALE;BEL;should not +HD_TALE;KNOX1;should not +HD_TALE;KNOX2;should not +HMG;ARID;should not +HMG;HMG_box;should +HMG;YABBY;should not +HRT;HRT;should +HSF;HSF_DNA-bind;should +IWS1;Med26;should +Jumonji_PKDM7;JmjC;should +Jumonji_PKDM7;JmjN;should +Jumonji_PKDM7;zf-C5HC2;should +Jumonji_PKDM7;FYRN;should +Jumonji_PKDM7;FYRC;should +Jumonji_Other;JmjC;should +LFY;FLO_LFY;should +LIM;two_or_more_LIM;should +LUG;LUFS_Domain;should +MADS;SRF-TF;should +MADS;K-box;should not +MADS_MIKC;SRF-TF;should +MADS_MIKC;K-box;should +MBF1;MBF1;should +Med6;Med6;should +Med7;Med7;should +mTERF;mTERF;should +MYB-2R;G2-like_Domain;should not +MYB-2R;Response_reg;should not +MYB-2R;trihelix;should not +MYB-2R;MYB-2R;should +MYB-3R;G2-like_Domain;should not +MYB-3R;Response_reg;should not +MYB-3R;trihelix;should not +MYB-3R;MYB-3R;should +MYB-4R;G2-like_Domain;should not +MYB-4R;Response_reg;should not +MYB-4R;trihelix;should not +MYB-4R;MYB-4R;should +MYB-related;ARID;should not +MYB-related;G2-like_Domain;should not +MYB-related;Myb_DNA-binding;should +MYB-related;Response_reg;should not +MYB-related;trihelix;should not +MYB-related;MYB-2R;should not +MYB-related;MYB-3R;should not +MYB-related;MYB-4R;should not +NAC;NAM;should +NZZ;NOZZLE;should +OFP;Ovate;should +PcG_EZ;CXC;should +PcG_EZ;SET;should +PcG_FIE;FIE_clipped_for_HMM;should +PcG_FIE;WD40;should +PcG_VEFS;VEFS-Box;should +PcG_VEFS;zf-C2H2;should not +PcG_MSI;WD40;should +PcG_MSI;CAF1C_H4-bd;should +PcG_MSI;FIE_clipped_for_HMM;should not +PHD;Myb_DNA-binding;should not +PHD;Alfin-like;should not +PHD;ARID;should not +PHD;DDT;should not +PHD;Homeobox;should not +PHD;JmjC;should not +PHD;JmjN;should not +PHD;PHD;should +PHD;SWIB;should not +PHD;zf-TAZ;should not +PHD;zf-MIZ;should not +PHD;zf-CCCH;should not +PHD;HMG_box;should not +PLATZ;PLATZ;should +Pseudo ARR-B;CCT;should +Pseudo ARR-B;Response_reg;should +Pseudo ARR-B;tify;should not +RB;RB_B;should +Rcd1-like;Rcd1;should +Rel;RHD_DNA_bind;should +RF-X;RFX_DNA_binding;should +RRN3;RRN3;should +Runt;Runt;should +S1Fa-like;S1FA;should +SAP;STER_AP;should +SBP;SBP;should +SET;zf-C2H2;should not +SET;TCR;should not +SET;CXC;should not +SET;PHD;should not +SET;Myb_DNA-binding;should not +SET;SET;should +Sigma70-like;Sigma70_r2;should +Sigma70-like;Sigma70_r3;should +Sigma70-like;Sigma70_r4;should +Sin3;PAH;should +Sin3;WRKY;should not +Sir2;SIR2;should +SOH1;Med31;should +SRS;DUF702;should +SWI/SNF_BAF60b;SWIB;should +SWI/SNF_SNF2;AP2;should not +SWI/SNF_SNF2;PHD;should not +SWI/SNF_SNF2;SNF2_N;should +SWI/SNF_SNF2;zf-CCCH;should not +SWI/SNF_SNF2;Myb_DNA-binding;should not +SWI/SNF_SNF2;HMG_box;should not +SWI/SNF_SWI3;SWIRM;should +SWI/SNF_SWI3;Myb_DNA-binding;should +TEA;TEA;should +TFb2;Tfb2;should +tify;tify;should +TRAF;BTB;should +TRAF;zf-TAZ;should not +Trihelix;trihelix;should +TUB;Tub;should +ULT;ULT_Domain;should +VARL;VARL;should +VOZ;VOZ_Domain;should +Whirly;Whirly;should +WRKY;WRKY;should +Zinc finger, AN1 and A20 type;zf-AN1;should +Zinc finger, AN1 and A20 type;zf-C2H2;should not +Zinc finger, MIZ type;zf-MIZ;should +Zinc finger, MIZ type;zf-C2H2;should not +Zinc finger, ZPR1;zf-ZPR1;should +Zn_clus;Zn_clus;should +ALOG;ALOG;should +C2H2;C2H2-IDD;should not +C2H2_IDD;C2H2-IDD;should +C2H2_IDD;zf-C2H2;should +MYST;zf-MYST;should +CBP;CBP;should +CBP;zf-TAZ;should +CBP;BTB;should not +TAFII250;DUF3591;should +LOB1;bZIP_1;should not +LOB1;bZIP_2;should not +LOB1;DUF260;should +LOB1;HLH;should not +LOB1;Homeobox;should not +LOB2;LOB2;should +LDL/FLD;SWIRM;should +LDL/FLD;Myb_DNA-binding;should not +ADA2;zz-ADA2;should +ADA2;Myb_DNA-binding;should +RKD;RWP-RK;should +RKD;NLP;should not +NLP;RWP-RK;should +NLP;NLP;should +CRF;CRF;should +CRF;AP2;should +GIY_YIG;GIY_YIG;should +ZPR;ZPR;should +HD-LD;LD;should +HD-NDX;NDX;should +HD-SAWADEE;SAWADEE;should +C1HDZ;C1HDZ;should +C1HDZ;Homeobox;should +C1HDZ;START;should not +C1HDZ;MEKHLA;should not +C2HDZ;C2HDZ;should +C2HDZ;Homeobox;should +C2HDZ;START;should not +C2HDZ;MEKHLA;should not
--- a/tapscan_script_v74.pl Wed Feb 14 13:54:16 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,916 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; - -# Written by Gerrit Timmerhaus (gerrit.timmerhaus@biologie.uni-freiburg.de). -# Changes included by Kristian Ullrich, Per Wilhelmsson and Romy Petroll. - -# Script to extract all detected domains out of a hmmsearch results file and classify the families of all used proteins based on these domains. -# The classification depends on a table which contains all known classification rules for the protein families of interest and on specific coverage values defined for every domain. -# The script provides three outputs, namely output.1, output.2 and output.3. The output files are tables in ";"-delimited format. -# The structure of output.1 is: "sequence ID ; TAP family ; number of classifications ; domains". -# Output.3 shares in principle the same structure as output.1, except that subfamilies are considered. ("sequence ID ; TAP family ; Subfamily ; number of classifications ; domains") -# The superior TAP family is specified first, followed by the subfamily. If a TAP family has no subfamily, the TAP family is specified first and then a "-". -# The structure of output.2 is: "TAP family";"number of detected proteins". -# More than one entry for a protein is possible because the classification rules may allow more than one classification. -# -# The script must be startet with the arguments <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamily classifications file> <"filter" if desired> - -if (!@ARGV or ($ARGV [0] eq "-h") or ($ARGV [0] eq "-help")) { - print "Usage: extract.and.classify.pl <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamily classifications file> <\"filter\" (if desired)>\n\n"; - exit; -} - -# hmmsearch_output: domtblout file -my $hmmsearch_output = $ARGV [0]; -# decision_table: rules file -my $decision_table = $ARGV [1]; -# family_classifications: output.1 -my $family_classifications = $ARGV [2]; -# family_statistics: output.2 -my $family_statistics = $ARGV [3]; -# subfamily_classifications: output.3 -my $subfamily_classifications = $ARGV [4]; -# domspec_cuts: coverage values file -my $domspec_cuts = $ARGV [5]; -# gene_model_filte: filter for ARATH and ORYSA -my $gene_model_filter = $ARGV [6]; - -if ($family_statistics eq "") { - print "Usage: extract.and.classify.pl <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamil classifications file> <\"filter\" (if desired)>\n\n"; - exit; -} - -if ($gene_model_filter and $gene_model_filter eq "filter") { - print "\nGene model filter is activated. It only works for TAIR (Arabidopsis) and TIGR (Rice) proteins up to now\n"; -} - -# Array where the $hmmsearch-output/domtblout file will be stored -my @output = (); -# Array with domain-specific coverage values -my @cuts = (); -# Array with rules -my @dec_table = (); -# Counter for the number of detected domains in the hmmsearch output file -my $entry_counter = 0; -# Containes the actual result for a query sequence -my $akt_entry = ""; -# Used to define query entry to ignore similar domains -my $whole_entry = ""; -# Includes the final entries after ignoring similar domains -my @results_of_extraction = (); -# Used to define query entry to ignore similar domains -my $extracted_domain = ""; -# Used to define query entry to ignore similar domains -my $present = ""; -# Used to define query entry to ignore similar domains -my $protein = ""; - -my $lek = ""; - -############################################ -### 1. Read in the hmmsearch output file ### -############################################ - -print "\n*** reading in $hmmsearch_output ***\n\n"; - -@output = get_file_data("$hmmsearch_output"); - -print "*** Parsing $hmmsearch_output ***\n\n"; - -# If wrong format exit the program, ninth row from the end -if ($output [-9] !~ /^# Program: hmmsearch.*/) { - print "wrong file format. $hmmsearch_output has to be a hmmsearch output file\n\n"; - exit; -} - -### 1.1 Trim input file (hmmsearch) to fit script. Erase lines starting with #. -@output = grep !/^#.*/, @output; - -# Read domain-specific coverage values file -# domname cut-off -# EIN3 0.5874125874 - -open(FILENAME,$domspec_cuts); -my %cuts = map { chomp; split /\t/ } <FILENAME>; - -### 1.2 Use first (prot name), third (domain name), eight (dom evalue), fourteenth (# overlapping) and fifteenth (# domain) column of data input(array element). -foreach my $output (@output) { - $output =~ /^(\S+)\s+\S+\s+\S+\s+(\S+)\s+\S+\s+(\S+)\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+\S+\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+.*/; - $lek = (($6-$5)+1)/$3; - $output = $1."\t".$2."\t".$4."\t".$lek; - } - -# Stucture: -# ARATHwo_AT2G03430.1 Ank 1.8e-08 0.93 -# ARATHwo_AT2G03430.1 Ank 7.8e-10 0.95 -# ARATHwo_AT2G03430.1 Ank 1.2e-08 0.93 -# ARATHwo_AT2G03430.1 Ank 1.2e-08 0.63 - -################################ -### 2. Modify the input data ### -################################ - -### 2.1 Run through length_cut_off loop (throws out entries below coverage values) -my @cutarray; -foreach my $output (@output) { - $output =~ /^(\S+)\t+(\S+)\t(\S+)\t(\S+)/; - if ($4 > $cuts{$2}) { - push (@cutarray, $output); - } -} - -# End up with (prot name) (domain name) (dom evalue) (overlapping) - -# Structure: -# ARATHwo_AT2G03430.1 Ank 1.8e-08 0.93 -# ARATHwo_AT2G03430.1 Ank 7.8e-10 0.95 -# ARATHwo_AT2G03430.1 Ank 1.2e-08 0.93 - - -### 2.2 Retain hit with the lowest evalue - -# String for the previous protein name -my $old_prot = ""; -# Coordinates for array filling -my $x = -1; -my $y = 1; -my $scoreeval; -my @newarray; -foreach my $cutarray (@cutarray) { - $cutarray =~ /^(\S+)\t+(\S+)\t(\S+)\t(\S+)/; - $akt_entry = $1.$2; - # If the protein contains more than one of the same domain - if ($old_prot eq $akt_entry) { - if ($3>$scoreeval) { - $y++; - $newarray[$x] = $1."\t".$2."\t".$3."\t".$y; - } - else { - $y++; - $newarray[$x] = $1."\t".$2."\t".$scoreeval."\t".$y; - } - } - # If the protein is new reset $j and push the entry in the next array - else { - $y = 1; - $x++; - $newarray[$x] = $1."\t".$2."\t".$3."\t".$y; - } - $old_prot = $akt_entry; - $scoreeval = $3 - -} -# Final structure: -# ARATHwo_AT2G03430.1 Ank 7.8e-10 3 - -### 2.3 Then sort @output primarily on prot name($1) and then secondarily on eval($3) to fit the "similar"-procedure. -@newarray = sort { (split '\t', $a)[0] cmp (split '\t', $b)[0] || (split '\t', $a)[2] <=> (split '\t', $b)[2] } @newarray; - -# Flags for merged domains -my $ARR_B_counter = 0; -my $bZIP_counter = 0; -my $ET_counter = 0; - -# Flags for similar domain omission -my $WD40domain = 0; # modified for FIE rule -my $FIEdomain = 0; # --------'''''-------- -my $MYBdomain = 0; -my $G2domain = 0; -my $YBdomain = 0; -my $YCdomain = 0; -my $PHDdomain = 0; -my $Alfindomain = 0; -my $Dr1domain = 0; -my $GATAdomain = 0; -my $Dofdomain = 0; -my $C1domain = 0; - -foreach my $line (@newarray) { - -# Reset the flags for domain omission - if ($line !~ /^$protein.*$/ ) { - $WD40domain = 0; - $FIEdomain = 0; - $MYBdomain = 0; - $G2domain = 0; - $YBdomain = 0; - $YCdomain = 0; - $PHDdomain = 0; - $Alfindomain = 0; - $Dr1domain = 0; - $GATAdomain = 0; - $Dofdomain = 0; - $C1domain = 0; - - } - - # Get the Query entry - $line =~ /^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/; - $protein = "$1"; - $extracted_domain = $2; - $present = "$4"; # FIX! number for many Myb - $whole_entry = $1.";".$2.";".$3.";".$4; - - - # Ignore FIE if it is not better scored than WD40 - if ($extracted_domain eq "WD40" and $FIEdomain == 0) {$WD40domain = 1;} - - # Ignore similar domains part 1 - - if ($extracted_domain eq "Myb_DNA-binding" and $G2domain == 0) {$MYBdomain = 1;} - if ($extracted_domain eq "G2-like_Domain" and $MYBdomain == 0) {$G2domain = 1;} - - if ($extracted_domain eq "PHD" and $Alfindomain == 0 and $C1domain == 0) {$PHDdomain = 1;} - if ($extracted_domain eq "Alfin-like" and $PHDdomain == 0 and $C1domain == 0) {$Alfindomain = 1;} - if ($extracted_domain eq "C1_2" and $Alfindomain == 0 and $PHDdomain == 0) {$C1domain = 1;} - - if ($extracted_domain eq "GATA" and $Dofdomain == 0) {$GATAdomain = 1;} - if ($extracted_domain eq "zf-Dof" and $GATAdomain == 0) {$Dofdomain = 1;} - - # Ignore similar domains part 2 - - if ($extracted_domain eq "FIE_clipped_for_HMM" and $WD40domain == 1) {next;} - if ($extracted_domain eq "Myb_DNA-binding" and $G2domain == 1) {next;} - if ($extracted_domain eq "G2-like_Domain" and $MYBdomain == 1) {next;} - - if ($extracted_domain eq "PHD" and ($Alfindomain == 1 or $C1domain == 1)) {next;} - if ($extracted_domain eq "Alfin-like" and ($PHDdomain == 1 or $C1domain ==1)) {next;} - if ($extracted_domain eq "C1_2" and ($PHDdomain == 1 or $Alfindomain == 1)) {next;} - - if ($extracted_domain eq "GATA" and $Dofdomain == 1) {next;} - if ($extracted_domain eq "zf-Dof" and $GATAdomain == 1) {next;} - - # Save the entry - push @results_of_extraction, $whole_entry; - $entry_counter++; -} -print "$entry_counter domain matches were found in $hmmsearch_output\n\n"; - -# Sort the entries: -my @sorted_results_of_extraction = @results_of_extraction; -print "*** Preparing hmmsearch results for classification ***\n\n"; - -# This array will be filled with the sorted proteins -my $array_of_arrays = []; - -# String for the previous protein name -my $old_entry = ""; - -# Coordinates for array filling -my $i = -1; -my $j = 0; - -@dec_table = get_file_data("$decision_table"); - -if ($dec_table [0] !~ /^[^;]+;[^;]+;[^;]/) { - print "wrong file format. $decision_table has to be in the format family;domain;type\n\n"; - exit; -} - -# Formate the entries -foreach my $domain_entry (@sorted_results_of_extraction) { - - $domain_entry =~ /^([^;]+);/; - $akt_entry = $1; - # If the protein has more than one domain push it behind the old entry - if ($old_entry eq $akt_entry) { - $j++; - $array_of_arrays->[$i][$j] = $domain_entry; - } - - # If the protein is new reset $j and push the entry in the next array - else { - $j = 0; - $i++; - $array_of_arrays->[$i][$j] = $domain_entry; - } - $old_entry = $akt_entry; - -} -$old_entry = ""; - -# Remember the number of entries to use them later -my $number_of_proteins = $i+1; -print "$number_of_proteins proteins were found in $hmmsearch_output\n"; -$number_of_proteins--; - -######################################### -### 3. Get the classification entries ### -######################################### - -# Make a list of all families in the classification rules file -my @liste_alle_familien = ('0_no_family_found'); - -my $ARR_B_in_list = 0; -my $bZIP_in_list = 0; -my $ET_in_list = 0; - -my $array_of_classifications = []; -$i = -1; -$j = 0; - -my @classifications_input = get_file_data("$decision_table"); - -# Sort the classifications -my @classifications = sort @classifications_input; - - -foreach my $classification (@classifications) { - $classification =~ /^([^;]+);/; - $akt_entry = $1; - - # If the family has more than one domain entry push it behind the old entry - if ($old_entry eq $akt_entry) { - - $j++; - $array_of_classifications->[$i][$j] = $classification; - } - - # If the family is new push it in the next array and reset $j - else { - $j = 0; - $i++; - $array_of_classifications->[$i][$j] = $classification; - # Add family to list of all families in the classification rules - push(@liste_alle_familien,$akt_entry); - - # Add hardcoded "OR" defined families to list of families if subfamily is present - # Do this just once for each family - if (($akt_entry eq "GARP_ARR-B_Myb") or ($akt_entry eq "GARP_ARR-B_G2")) { - if ($ARR_B_in_list == 0) { - push(@liste_alle_familien,"GARP_ARR-B"); - $ARR_B_in_list++; - }} - if (($akt_entry eq "bZIP1") or ($akt_entry eq "bZIP2") or ($akt_entry eq "bZIPAUREO") or ($akt_entry eq "bZIPCDD")) { - if ($bZIP_in_list == 0) { - push(@liste_alle_familien,"bZIP"); - $bZIP_in_list++; - }} - if (($akt_entry eq "HRT") or ($akt_entry eq "GIY_YIG")) { - if ($ET_in_list == 0) { - push(@liste_alle_familien,"ET"); - $ET_in_list++; - }} - } - $old_entry = $akt_entry; -} - -my $number_of_classifications = $i+1; -print "$number_of_classifications different families were found in $decision_table\n\n"; -$number_of_classifications--; - -print "*** begin classification ***\n\n"; - -# Create the output files (output.1 and output.3) -my $outputfile = "$family_classifications"; -my $subfamilyoutput = "$subfamily_classifications"; - -unless (open(FAMILY_CLASSIFICATIONS, ">$outputfile")) { - print "Cannot open file \"$outputfile\" to write to!!\n\n"; - exit; -} - -unless (open(SUBFAMILY_CLASSIFICATIONS, ">$subfamilyoutput")) { - print "Cannot open file \"$subfamilyoutput\" to write to!!\n\n"; - exit; -} - -print FAMILY_CLASSIFICATIONS "family classifications for $hmmsearch_output\n"; -print SUBFAMILY_CLASSIFICATIONS "family classifications for $hmmsearch_output\n"; - -# Counter for classified families -my $classified_families = 0; -# Counter for unclassified families -my $unclassified_families = 0; -# Counter for the different amount of entries in classification possibilities -my @entry_counter = []; -# Counter for the possible classifications for the current protein -my $possibilities = 0; -# For later famliy statistics -my @family_list = []; -# This array will be filled with the result. Later written in output.1 and output.3 -my @family_classifications_file = []; -my @subfamily_classifications_file = []; - -# One time for every protein in array of arrays -for (my $ii = 0; $ii <= $number_of_proteins; $ii++) { - my $jj = 0; - $possibilities = 0; - - # Flag to mark that any family was found for the current protein - my $member_found = 0; - - # In this string all the domains of one protein will be stored - # The ; signs are important to mark a single entry (for example to differ AP2 and TF_AP2) - my $domains = ";"; - - # Get the domains in all elements of the current protein array: - while ($array_of_arrays->[$ii][$jj]) { - - $array_of_arrays->[$ii][$jj] =~ /([^;]+);[^;]+;(\d+)$/; - # For discrimination between MYB and MYB-related. If more than 1 MYB domains was found replaced "Myb_DNA-binding" with "two_or_more_Myb_DNA-binding" - # Same for LIM - if ($1 eq "Myb_DNA-binding" and $2 eq 2) { - $domains .= "MYB-2R;Myb_DNA-binding;"; - } - elsif ($1 eq "Myb_DNA-binding" and $2 eq 3) { - $domains .= "MYB-3R;Myb_DNA-binding;"; - } - elsif ($1 eq "Myb_DNA-binding" and $2 eq 4) { - $domains .= "MYB-4R;Myb_DNA-binding;"; - } - elsif ($1 eq "LIM" and $2 > 1) { - $domains .= "two_or_more_LIM;LIM;"; - } - else { - $domains .= "$1;"; - } - $jj++; - } - -######################### -### 4. Classification ### -######################### - -# Get the name of the actual protein for the output after classification: -$array_of_arrays->[$ii][0] =~ /^([^;]+);/; -$akt_entry = $1; - -# Variables for double classification decisions: - -# Here the shoulds for the actual family will be stored -my $shoulds_act_family = ";"; -# Here the shoulds of the previous CLASSIFIED family will be stored -my $shoulds_prev_family = ""; - -# This loop will be made for every family entry in output.1 and output.3 -for ($i = 0;$i <= $number_of_classifications; $i++) { - $j = 0; - - # Flag to mark if the protein is a possible member of the current family - my $possible_member = 0; - - # Reset the domain entries for the current family - $shoulds_act_family = ";"; - - # This loop will be made for every domain classification entry for the current family - while ($array_of_classifications->[$i][$j]) { - - # For "should" entries - if ($array_of_classifications->[$i][$j] =~ /;should$/) { - - # Get the domain from the classification entry - $array_of_classifications->[$i][$j] =~ /^[^;]+;([^;]+);[^;]+/; - - my $domain_classification_entry = $1; - # Check if the protein might be a member of the current family. If yes check the next entry - if ($domains =~ /;$domain_classification_entry;/) { - $possible_member = 1; - $j++; - # All matched domains for the current family are stored here: - $shoulds_act_family .= "$domain_classification_entry;"; - next; - } - $possible_member = 0; - $j++; - last; - } - - # For "should not" entries - if ($array_of_classifications->[$i][$j] =~ /;should not$/) { - - # Get the domain from the classification entry - $array_of_classifications->[$i][$j] =~ /^[^;]+;([^;]+);[^;]+/; - - # Check if the protein has the domain. If yes go to the next family - my $domain_check = $1; - if ($domains =~ /;$domain_check;/) { - $possible_member = 0; - last; - } - # If not check the next entry - else { - $j++; - next; - } - } - - # This happens when there is no "should" or "should not" entry at the right position - print "error in classification entry $array_of_classifications->[$i][$j]\n" - } - - # If the check was successful print the found classification - if ($possible_member) { - - $array_of_classifications->[$i][0] =~ /^([^;]+);/; - my $currentFamily = $1; - $possibilities++; - # The double classified proteins solution: - if ($possibilities > 1) { - - # Fill an array with the current domains in $domains - my @domains_array = []; - my $number_of_domains = ($domains =~ tr/;/;/)-1; - my $tempdomains = $domains; - for (my $domain_counter = 0;$domain_counter != $number_of_domains;$domain_counter++) { - - $tempdomains =~ /^;([^;]+);/; - $domains_array[$domain_counter] = $1; - # Remove the inserted domain from domains - $tempdomains =~ s/^;[^;]+;/;/; - } - # Compare the two possible families (actual and previous family) - my $array_counter = 0; - while ($array_counter < $number_of_domains) { - if ($shoulds_prev_family =~ /$domains_array[$array_counter]/) { - if ($shoulds_act_family =~ /$domains_array[$array_counter]/) { - - # If actual domain is in previous and actual family go to next domain - $array_counter++; - next; - - } - # Protein was right classified in the previous classification. The actual entry will be ignored - - $possibilities--; - last; - } - - # If actual domain is in the actual family but not in the previous. The actual classification is right - if ($shoulds_act_family =~ /$domains_array[$array_counter]/) { - - pop @family_classifications_file; - pop @subfamily_classifications_file; - - # Merge "GARBP_ARR-B_Myb" and "GARP_ARR-B_G2" to "GARP_ARR-B" - if ( ($currentFamily eq "GARP_ARR-B_Myb") or ($currentFamily eq "GARP_ARR-B_G2")){ - - push @family_classifications_file, "$akt_entry;GARP_ARR-B;$possibilities$domains\n"; - push @subfamily_classifications_file, "$akt_entry;GARP_ARR-B;-;$possibilities$domains\n"; - if ($ARR_B_counter == 0) { - push @liste_alle_familien, 'GARP_ARR-B'; - $ARR_B_counter++; - } - } - elsif ( ($currentFamily eq "bZIP1") or ($currentFamily eq "bZIP2") or ($currentFamily eq "bZIPAUREO") or ($currentFamily eq "bZIPCDD")) { - - push @family_classifications_file, "$akt_entry;bZIP;$possibilities$domains\n"; - push @subfamily_classifications_file, "$akt_entry;bZIP;-;$possibilities$domains\n"; - if ($bZIP_counter == 0) { - push @liste_alle_familien, 'bZIP'; - $bZIP_counter++; - } - } - elsif ( ($currentFamily eq "HRT") or ($currentFamily eq "GIY_YIG")) { - push @subfamily_classifications_file, "$akt_entry;ET;-;$possibilities$domains\n"; - push @family_classifications_file, "$akt_entry;ET;$possibilities$domains\n"; - if ($ET_counter == 0) { - push @liste_alle_familien, 'ET'; - $ET_counter++; - } - } - else { - - push @family_classifications_file, "$akt_entry;$currentFamily;$possibilities$domains\n"; - push @subfamily_classifications_file, "$akt_entry;$currentFamily;-;$possibilities$domains\n"; - } - $possibilities--; - last; - - } - - $array_counter++; - } - - } - # Normal enty: if the family is classified for one family write it in the array - else { - - # Store the matched domains of the actual family for respectively later doubly classification decision - $shoulds_prev_family = $shoulds_act_family; - # Store the entry in the output array - # Merge "GARBP_ARR-B_Myb" and "GARP_ARR-B_G2" to "GARP_ARR-B" - # Add GARP_ARR-B to @liste_alle_familien ONCE - - if ( ($currentFamily eq "GARP_ARR-B_Myb") or ($currentFamily eq "GARP_ARR-B_G2")){ - push @family_classifications_file, "$akt_entry;GARP_ARR-B;$possibilities$domains\n"; - push @subfamily_classifications_file, "$akt_entry;GARP_ARR-B;-;$possibilities$domains\n"; - if ($ARR_B_counter == 0) { - push @liste_alle_familien, 'GARP_ARR-B'; - $ARR_B_counter++; - } - } - elsif (($currentFamily eq "bZIP1") or ($currentFamily eq "bZIP2") or ($currentFamily eq "bZIPAUREO") or ($currentFamily eq "bZIPCDD")){ - push @family_classifications_file, "$akt_entry;bZIP;$possibilities$domains\n"; - push @subfamily_classifications_file, "$akt_entry;bZIP;-;$possibilities$domains\n"; - if ($bZIP_counter == 0) { - push @liste_alle_familien, 'bZIP'; - $bZIP_counter++; - } - } - elsif (($currentFamily eq "HRT") or ($currentFamily eq "GIY_YIG")){ - push @family_classifications_file, "$akt_entry;ET;$possibilities$domains\n"; - push @subfamily_classifications_file, "$akt_entry;ET;-;$possibilities$domains\n"; - if ($ET_counter == 0) { - push @liste_alle_familien, 'ET'; - $ET_counter++; - } - } - else { - - push @family_classifications_file, "$akt_entry;$currentFamily;$possibilities$domains\n"; - push @subfamily_classifications_file, "$akt_entry;$currentFamily;-;$possibilities$domains\n"; - } - - $classified_families++; - } - - # Flag to mark that any family was found for the current protein - $member_found = 1; - } - - } - - if ($possibilities == 0) { - - # If no family was found for the current protein give this out - push @family_classifications_file, "$akt_entry;0_no_family_found;0$domains\n"; - push @subfamily_classifications_file, "$akt_entry;0_no_family_found;-;0$domains\n"; - $unclassified_families++; - } - - # Increment the number of possible members in the array at the right position - else { - - my $temp = $entry_counter[$possibilities]; - $temp++; - $entry_counter[$possibilities] = $temp; - } - -} -# Define for which TAP families also subfamilies may be present: - -foreach my $subfamily_classifications_file (@subfamily_classifications_file) { - - # If "C2H2" was assigned to a sequence, write "C2H2;C2H2" to output.3 as TAP family and subfamily, respectively. - if ($subfamily_classifications_file =~ /C2H2/ ) { - $subfamily_classifications_file =~ s/C2H2;-/C2H2;C2H2/ - } - # If "C2H2_IDD" was assigned to a sequence, write "C2H2;C2H2_IDD" to output.3 as TAP family and subfamily, respectively. - if ($subfamily_classifications_file =~ /C2H2_IDD/ ) { - $subfamily_classifications_file =~ s/C2H2_IDD;-/C2H2;C2H2_IDD/ - } - if ($subfamily_classifications_file =~ /bHLH/ ) { - $subfamily_classifications_file =~ s/bHLH;-/bHLH;bHLH/ - } - if ($subfamily_classifications_file =~ /bHLH_TCP/ ) { - $subfamily_classifications_file =~ s/bHLH_TCP;-/bHLH;bHLH_TCP/ - } - if ($subfamily_classifications_file =~ /NF-YA/ ) { - $subfamily_classifications_file =~ s/NF-YA;-/NFY;NF-YA/ - } - if ($subfamily_classifications_file =~ /NF-YB/ ) { - $subfamily_classifications_file =~ s/NF-YB;-/NFY;NF-YB/ - } - if ($subfamily_classifications_file =~ /NF-YC/ ) { - $subfamily_classifications_file =~ s/NF-YC;-/NFY;NF-YC/ - } - if ($subfamily_classifications_file =~ /C1HDZ/ ) { - $subfamily_classifications_file =~ s/C1HDZ;-/HDZ;C1HDZ/ - } - if ($subfamily_classifications_file =~ /C2HDZ/ ) { - $subfamily_classifications_file =~ s/C2HDZ;-/HDZ;C2HDZ/ - } - if ($subfamily_classifications_file =~ /C3HDZ/ ) { - $subfamily_classifications_file =~ s/C3HDZ;-/HDZ;C3HDZ/ - } - if ($subfamily_classifications_file =~ /C4HDZ/ ) { - $subfamily_classifications_file =~ s/C4HDZ;-/HDZ;C4HDZ/ - } - if ($subfamily_classifications_file =~ /MYST/ ) { - $subfamily_classifications_file =~ s/MYST;-/HAT;MYST/ - } - if ($subfamily_classifications_file =~ /CBP/ ) { - $subfamily_classifications_file =~ s/CBP;-/HAT;CBP/ - } - if ($subfamily_classifications_file =~ /TAFII250/ ) { - $subfamily_classifications_file =~ s/TAFII250;-/HAT;TAFII250/ - } - if ($subfamily_classifications_file =~ /GNAT/ ) { - $subfamily_classifications_file =~ s/GNAT;-/HAT;GNAT/ - } - if ($subfamily_classifications_file =~ /LOB1/ ) { - $subfamily_classifications_file =~ s/LOB1;-/LBD;LOB1/ - } - if ($subfamily_classifications_file =~ /LOB2/ ) { - $subfamily_classifications_file =~ s/LOB2;-/LBD;LOB2/ - } - if ($subfamily_classifications_file =~ /MYB-related/ ) { - $subfamily_classifications_file =~ s/MYB-related;-/MYB;MYB-related/ - } - if ($subfamily_classifications_file =~ /SWI\/SNF_SWI3/ ) { - $subfamily_classifications_file =~ s/SWI\/SNF_SWI3;-/MYB-related;SWI\/SNF_SWI3/ - } - if ($subfamily_classifications_file =~ /MYB-2R/ ) { - $subfamily_classifications_file =~ s/MYB-2R;-/MYB;MYB-2R/ - } - if ($subfamily_classifications_file =~ /MYB-3R/ ) { - $subfamily_classifications_file =~ s/MYB-3R;-/MYB;MYB-3R/ - } - if ($subfamily_classifications_file =~ /MYB-4R/ ) { - $subfamily_classifications_file =~ s/MYB-4R;-/MYB;MYB-4R/ - } - if ($subfamily_classifications_file =~ /RKD/ ) { - $subfamily_classifications_file =~ s/RKD;-/RWP-RK;RKD/ - } - if ($subfamily_classifications_file =~ /NLP/ ) { - $subfamily_classifications_file =~ s/NLP;-/RWP-RK;NLP/ - } - if ($subfamily_classifications_file =~ /AP2/ ) { - $subfamily_classifications_file =~ s/AP2;-/AP2;AP2/ - } - if ($subfamily_classifications_file =~ /CRF/ ) { - $subfamily_classifications_file =~ s/CRF;-/AP2;CRF/ - } -} - -### 4.1 Filter for gene models - -# Filter for gene models. All alternative splice variants will be removed from Arabidopsis and rice entries -shift @family_classifications_file; -shift @subfamily_classifications_file; -# Sort the array for filter and a sorted output -@family_classifications_file = sort @family_classifications_file; -@subfamily_classifications_file = sort @subfamily_classifications_file; -if ($gene_model_filter and $gene_model_filter eq "filter") { - print "*** filtering gene models: removing splice variants ***\n"; - - # temporary array for the filtered entries - my @filtered_entries = []; - my $models_removed_counter = 0; - - my $prev_entry = ""; - foreach my $entry_line (@family_classifications_file) { - if ($entry_line !~ /^([^.]+).\d+/) { - print "Bad format for filtering splice variants. Use \"filter\" only for TAIR (Aradopsis) or TIGR (Rice) files.\n\n"; - exit; - } - if ($1 eq $prev_entry) { - $models_removed_counter++; - next; - } - $prev_entry = $1; - push @filtered_entries, $entry_line; - } - print " -> $models_removed_counter gene models were removed\n\n"; - @family_classifications_file = @filtered_entries; - shift @family_classifications_file; -} - -# Print the results in the array to output.1 and output.3 and push the names of the TAP families and Subfamilies into $family_list to create output.2 -foreach my $fcf_line (@family_classifications_file) { - $fcf_line =~ /^[^;]+;([^;]+)/; - #push @family_list, "$1"; - print FAMILY_CLASSIFICATIONS "$fcf_line"; -} -close (FAMILY_CLASSIFICATIONS); - -foreach my $fcf_line (@subfamily_classifications_file) { - $fcf_line =~ /^[^;]+;([^;]+);([^;]+)/; - if ($2 eq "-") { - push @family_list, "$1"; - } - else { - push @family_list, "$2"; - } - print SUBFAMILY_CLASSIFICATIONS "$fcf_line"; -} - -close (SUBFAMILY_CLASSIFICATIONS); - -print "*** calculating the family statistics and write it in $family_statistics ***\n\n"; - -################################## -### 5. Create the output files ### -################################## - -my $statistics_outputfile = "$family_statistics"; - -unless (open(FAMILY_STATISTICS, ">$statistics_outputfile")) { - print "Cannot open file \"$statistics_outputfile\" to write to!!\n\n"; - exit; -} - -# Count the family entries -my @output_family_statistics = (); -my @gefundene_familien = (); -my $family_counter = 1; - -shift @family_list; -@family_list = sort @family_list; - - -my $old_family = ""; -push @family_list, 'BAD FIX'; # Makes to loop go through every fam and stops at non fam(BAD FIX). - -foreach my $line (@family_list) { - if ($line eq $old_family) { - $family_counter++; - } - elsif ($old_family ne "") { - push (@output_family_statistics,"$old_family;$family_counter\n"); - # Add all found families to a list of found families - push (@gefundene_familien,"$old_family"); - $family_counter=1; - } - $old_family = $line; -} - -my %hash = (); - -# Put all families from the classifictaion ruled and all found families in a hash -foreach my $element (@gefundene_familien,@liste_alle_familien) {$hash{$element}++;} - -# Remove merged families -delete $hash{'GARP_ARR-B_Myb'}; -delete $hash{'GARP_ARR-B_G2'}; -delete $hash{'bZIP1'}; -delete $hash{'bZIP2'}; -delete $hash{'bZIPAUREO'}; -delete $hash{'bZIPCDD'}; -delete $hash{'HRT'}; -delete $hash{'GIY_YIG'}; - -# Add all not found families from the classification rules to @output_family_statistics -# With zero as number of families found - -foreach my $element (keys %hash) { - if (($hash{$element} == 1) and ( $element eq "0_no_family_found")) { - push (@output_family_statistics,"$element;$unclassified_families\n"); - } - if (($hash{$element} == 1) and ($element ne "0_no_family_found")) { - push (@output_family_statistics,"$element;0\n"); - } -} - -# Sort @output_family_statistics caseinsensitive-alphabetically -my @sortierte_statistik = sort {lc $a cmp lc $b} @output_family_statistics; - -# Print headline to @sortierte_statistik -unshift (@sortierte_statistik,"family statistics for $hmmsearch_output\n"); - -print FAMILY_STATISTICS @sortierte_statistik; - -# Print FAMILY_STATISTICS "$old_family;$family_counter\n"; - -close (FAMILY_STATISTICS); - -################################################# -### 6. Give out some statistical informations ### -################################################# - -$entry_counter [0] = 0; -my $sum = 0; -foreach my $entry (@entry_counter) { - $sum += $entry; -} -print "$classified_families classifications were found for $sum proteins.\n"; -print "This classifications are divided in:\n"; -my $count = 0; -foreach my $element (@entry_counter) { - if ($count != 0) { - print "$element proteins were classified for $count"; - if ($count == 1) {print " family\n";} - else {print " different families\n";} - } - $count++; -} -print "\n$unclassified_families proteins could not be classified\n\n"; - -print "*** The results were written in $family_classifications and $subfamily_classifications ***\n"; -print "*** done ***\n\n"; - -exit; - -sub get_file_data { - - my ($filename) = @_; - - use strict; - use warnings; - - my @filedata = (); - - unless( open(GET_FILE_DATA, $filename)) { - print STDERR "Cannot open file \"$filename\"n\n"; - exit; - } - - @filedata = <GET_FILE_DATA>; - - close GET_FILE_DATA; - - return @filedata; -} -
--- a/test-data/output.2.tsv Wed Feb 14 13:54:16 2024 +0000 +++ b/test-data/output.2.tsv Thu Feb 22 10:07:53 2024 +0000 @@ -57,13 +57,14 @@ HD-NDX 0 HD-other 0 HD-SAWADEE 0 -HD_BEL 0 HD_DDT 0 -HD_KNOX1 0 -HD_KNOX2 0 HD_PHD 0 HD_PINTOX 0 HD_PLINC 0 +HD_TALE 0 +HD_TALE_BEL 0 +HD_TALE_KNOX1 0 +HD_TALE_KNOX2 0 HD_WOX 0 HMG 0 HSF 0
--- a/test-data/output.domtbl.tsv Wed Feb 14 13:54:16 2024 +0000 +++ b/test-data/output.domtbl.tsv Thu Feb 22 10:07:53 2024 +0000 @@ -20,9 +20,9 @@ # Program: hmmsearch # Version: 3.3.2 (Nov 2020) # Pipeline mode: SEARCH -# Query file: /home/saskia/code/github/galaxyproject/tools-iuc/tools/tapscan/tapscan_domains_v12.txt -# Target file: /tmp/saskia/tmp4n93kzh1/files/b/7/b/dataset_b7b39fd6-41f1-440d-bfb7-da7a3a9f070e.dat -# Option settings: hmmsearch --domtblout domtblout.txt --cut_ga /home/saskia/code/github/galaxyproject/tools-iuc/tools/tapscan/tapscan_domains_v12.txt /tmp/saskia/tmp4n93kzh1/files/b/7/b/dataset_b7b39fd6-41f1-440d-bfb7-da7a3a9f070e.dat -# Current dir: /tmp/saskia/tmp4n93kzh1/job_working_directory/000/4/working -# Date: Mon Nov 13 14:57:28 2023 +# Query file: /home/saskia/code/github/bgruening/galaxytools/tools/tapscan/tapscan_domains_v13.txt.gz +# Target file: /tmp/saskia/tmpwta2cyq_/files/f/5/c/dataset_f5c238a3-8cc8-42f9-86b5-ebc2431be570.dat +# Option settings: hmmsearch --domtblout domtblout.txt --cut_ga /home/saskia/code/github/bgruening/galaxytools/tools/tapscan/tapscan_domains_v13.txt.gz /tmp/saskia/tmpwta2cyq_/files/f/5/c/dataset_f5c238a3-8cc8-42f9-86b5-ebc2431be570.dat +# Current dir: /tmp/saskia/tmpwta2cyq_/job_working_directory/000/4/working +# Date: Thu Feb 22 10:10:40 2024 # [ok]