| Next changeset 1:c4f865bd101a (2024-02-22) |
|
Commit message:
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/tapscan commit 2e1cd301fb38af8a1e9267fc60fcb5ca3c576aeb |
|
added:
tapscan.xml tapscan_coverage_values_v10.txt tapscan_domains_v12.txt.gz tapscan_rules_v81.txt tapscan_script_v74.pl test-data/PUBLIC_Ectocarpus-sp7_proteins_head.fa test-data/output.1.tsv test-data/output.2.tsv test-data/output.3.tsv test-data/output.domtbl.tsv |
| b |
| diff -r 000000000000 -r 196795831b6a tapscan.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tapscan.xml Wed Feb 14 13:54:16 2024 +0000 |
| [ |
| @@ -0,0 +1,131 @@
+<tool id="tapscan_classify" name="TAPScan Classify" version="4.74+galaxy0" profile="23.0">
+    <!-- Galaxy wrapper for TAPscan: hmmsearch finds protein domains, then the bundled Perl script assigns TAP families. -->
+    <description>Detect Transcription Associated Proteins (TAPs)</description>
+    <edam_topics>
+        <edam_topic>topic_0121</edam_topic>
+    </edam_topics>
+    <requirements>
+        <requirement type="package" version="3.3.2">hmmer</requirement>
+        <requirement type="package" version="5.26">perl</requirement>
+        <requirement type="package" version="4.8">sed</requirement>
+    </requirements>
+    <!-- Tool-directory files read by the command below; each literal path must match the file name on disk exactly. -->
+    <required_files>
+        <include type="literal" path="tapscan_script_v74.pl"/>
+        <include type="literal" path="tapscan_domains_v12.txt.gz"/>
+        <include type="literal" path="tapscan_rules_v81.txt"/>
+        <include type="literal" path="tapscan_coverage_values_v10.txt"/>
+    </required_files>
+    <command detect_errors="aggressive"><![CDATA[
+
+## step 1: scan the input proteins against the bundled profile HMMs using the models' GA thresholds
+hmmsearch
+    --domtblout domtblout.txt
+    --cut_ga
+    '${__tool_directory__}/tapscan_domains_v12.txt.gz'
+    '$protein_fasta_in'
+
+&&
+
+## step 2: classify the domain hits with the TAPscan rules and coverage values
+perl '${__tool_directory__}/tapscan_script_v74.pl'
+    domtblout.txt
+    '${__tool_directory__}/tapscan_rules_v81.txt'
+    '$taps_detected'
+    '$taps_family_counts'
+    '$taps_detected_extra'
+    '${__tool_directory__}/tapscan_coverage_values_v10.txt'
+
+&&
+
+## make the outputs tab-separated for Galaxy compatibility
+## (only the leading separators are converted where a domain list follows, so that list stays in one column; 1d drops the first line)
+sed -i -e 's/;/\t/' -e 's/;/\t/' -e 's/;/\t/' -e "1d" '$taps_detected' &&
+sed -i -e 's/;/\t/g' -e "1d" '$taps_family_counts' &&
+sed -i -e 's/;/\t/4' -e 's/;/\t/' -e 's/;/\t/' -e 's/;/\t/' -e "1d" '$taps_detected_extra' &&
+
+## add header lines for clarity
+sed -i '1s/^/sequence ID\tTAP family\tnumber of classifications\tdomains\n/' '$taps_detected' &&
+sed -i '1s/^/TAP family\tnumber of detected proteins\n/' '$taps_family_counts' &&
+sed -i '1s/^/sequence ID\tTAP family\tsubfamily\tnumber of classifications\tdomains\n/' '$taps_detected_extra'
+
+    ]]></command>
+    <inputs>
+        <param name="protein_fasta_in" type="data" format="fasta" optional="false" label="Proteins in FASTA format" help=""/>
+        <param name="domtblout" type="boolean" checked="false" label="Output the HMMer domain hits table?"/>
+    </inputs>
+    <outputs>
+        <data name="taps_detected" format="tabular" label="${tool.name} on ${on_string}: Detected TAPs - domains and assigned TAP family for each gene ID">
+            <actions>
+                <action name="column_names" type="metadata" default="sequence ID,TAP family,number of classifications,domains" />
+            </actions>
+        </data>
+        <data name="taps_family_counts" format="tabular" label="${tool.name} on ${on_string}: Count - Summary of the number of members for each TAP family">
+            <actions>
+                <action name="column_names" type="metadata" default="TAP family,number of detected proteins" />
+            </actions>
+        </data>
+        <data name="taps_detected_extra" format="tabular" label="${tool.name} on ${on_string}: Detected TAPs Extra - with subfamily information">
+            <actions>
+                <action name="column_names" type="metadata" default="sequence ID,TAP family,subfamily,number of classifications,domains" />
+            </actions>
+        </data>
+        <!-- Optional output: only present when the "domtblout" checkbox is ticked (see filter). -->
+        <data name="domtbl" format="tabular" from_work_dir="domtblout.txt" label="${tool.name} on ${on_string}: HMMer Domain Hits Table">
+            <filter>domtblout</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!-- default run: the three classification tables, no domain hits table -->
+        <test expect_num_outputs="3">
+            <param name="protein_fasta_in" value="PUBLIC_Ectocarpus-sp7_proteins_head.fa" ftype="fasta"/>
+            <output name="taps_detected" file="output.1.tsv" ftype="tabular" lines_diff="2" />
+            <output name="taps_family_counts" file="output.2.tsv" ftype="tabular" lines_diff="2" />
+            <output name="taps_detected_extra" file="output.3.tsv" ftype="tabular" lines_diff="2" />
+        </test>
+        <test expect_num_outputs="4"><!-- test with domtblout -->
+            <param name="protein_fasta_in" value="PUBLIC_Ectocarpus-sp7_proteins_head.fa" ftype="fasta"/>
+            <param name="domtblout" value="true"/>
+            <output name="taps_detected" file="output.1.tsv" ftype="tabular" />
+            <output name="taps_family_counts" file="output.2.tsv" ftype="tabular" />
+            <output name="taps_detected_extra" file="output.3.tsv" ftype="tabular" />
+            <output name="domtbl" file="output.domtbl.tsv" ftype="tabular" lines_diff="10"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+TAPscan is a comprehensive tool for annotating TAPs with a special focus on species
+belonging to the Archaeplastida. In general, the detection of TAPs is based on the detection
+of highly conserved protein domains.
+
+During the first step, each sequence out of a species protein set is scanned for protein domains
+(stored as profile Hidden Markov Models) using hmmsearch. The domains list consists of 154 profile
+HMMs and functions as the domain reference during the hmmsearch command.
+
+Afterwards, by running TAPscan, specialized rules are applied to finally
+assign the protein sequences to TAP families based on the detected domains in the previous step.
+With the latest TAPscan v4, a protein set can be scanned for 137 different TAP families with high
+accuracy through applying GA-thresholds and coverage values.
+
+**Output Files**
+
+TAPscan provides the user with three different output files. Each output file is
+tab-separated.
+
+- **Output 1: "Detected TAPs"** - contains the detected domains and finally assigned TAP family
+  for each gene ID. If domains are assigned to a sequence but not all rules are fulfilled, the
+  sequence is assigned to “0_no_family_found”.
+- **Output 2: "Family Counts"** is a summary of the number of members for each TAP family.
+- **Output 3: "Detected TAPs Extra"** - is similar to output 1 but contains additional
+  information about subfamilies.
+
+    ]]></help>
+    <creator>
+        <organization name="Rensing Lab" url="https://plantco.de"/>
+    </creator>
+    <citations>
+        <!-- TODO: add citation for TAPscan v4 paper when published -->
+        <citation type="doi">10.3390/genes12071055</citation>
+    </citations>
+</tool> |
| b |
| diff -r 000000000000 -r 196795831b6a tapscan_coverage_values_v10.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tapscan_coverage_values_v10.txt Wed Feb 14 13:54:16 2024 +0000 |
| b |
| @@ -0,0 +1,154 @@ +Acetyltransf_1 0.0931034483 +AP2 0.3214285714 +ARID 0.0516393443 +AUX_IAA 0.1077981651 +Auxin_resp 0.4308510638 +B3 0.196875 +BES1_N 0.3888888889 +BSD 0.485915493 +BTB 0.3105095541 +bZIP_1 0.64453125 +bZIP_2 0.5948275862 +C1_2 0.1612903226 +CAF1C_H4-bd 0.1764705882 +CBFB_NFYA 0.2019230769 +CCT 0.4952830189 +CG-1 0.5568181818 +CSD 0.5878378378 +DDT 0.2678571429 +DEAD 0.1115591398 +dsrm 0.2330097087 +DUF260 0.3638059701 +DUF296 0.155075188 +DUF547 0.1181672026 +DUF573 0.3515625 +DUF632 0.3302603037 +DUF702 0.2314126394 +E2F_TDP 0.2078313253 +EIN3 0.5874125874 +FHA 0.1777456647 +FLO_LFY 0.4569138277 +FYRC 0.3928571429 +FYRN 0.46875 +GAGA_bind 0.2265774379 +GATA 0.5406976744 +GRAS 0.3990825688 +Helicase_C 0.118852459 +HLH 0.3191489362 +HMG_box 0.6866197183 +Homeobox 0.5485074627 +HSF_DNA-bind 0.071641791 +IQ 0.6428571429 +JmjC 0.5058139535 +JmjN 0.4261363636 +K-box 0.438 +KNOX1 0.578125 +KNOX2 0.6346153846 +LIM 0.4726027397 +MBF1 0.3049450549 +Med26 0.1939655172 +Med31 0.451048951 +Med6 0.1577868852 +Med7 0.1818181818 +MEKHLA 0.4357541899 +mTERF 0.4850543478 +Myb_DNA-binding 0.3636363636 +NAM 0.0142857143 +O-FucT 0.0914866582 +Ovate 0.3137755102 +PAH 0.1194267516 +PAZ 0.1564569536 +PC4 0.3848684211 +PHD 0.37 +Piwi 0.4082278481 +PLATZ 0.30625 +PP2C 0.4110962567 +QLQ 0.61875 +RB_B 0.1663987138 +Rcd1 0.4131355932 +Response_reg 0.5017241379 +RFX_DNA_binding 0.5257731959 +RHD_DNA_bind 0.4581447964 +Ribonuclease_3 0.0935013263 +RRN3 0.2342427093 +Runt 0.5714285714 +RWP-RK 0.4542253521 +S1FA 0.7386363636 +SBP 0.4274193548 +SET 0.0392857143 +SH2 0.4197247706 +Sigma70_r2 0.1675531915 +Sigma70_r3 0.6346153846 +Sigma70_r4 0.6388888889 +SIR2 0.45703125 +SNF2_N 0.1150895141 +SRF-TF 0.406779661 +SSXT 0.5856164384 +START 0.5067567568 +STAT_bind 0.3781869688 +SWIB 0.3267326733 +SWIRM 0.2532467532 +TANGO2 0.125 +TCP 0.219665272 +TCR 0.4943181818 +TEA 0.4078282828 +TF_AP-2 0.5660377358 +Tfb2 0.1757668712 +tify 0.5955882353 +Tub 0.0872576177 +VEFS-Box 0.4918831169 
+WD40 0.0563909774 +WHIM1 0.5263157895 +Whirly 0.5648148148 +WRC 0.6223404255 +WRKY 0.2196969697 +WSD 0.1768867925 +YABBY 0.2804621849 +zf-AN1 0.4027777778 +zf-B_box 0.2746478873 +zf-C2H2 0.5080645161 +zf-C5HC2 0.253125 +zf-CCCH 0.5689655172 +zf-Dof 0.5955882353 +ZF-HD_dimer 0.5294117647 +zf-MIZ 0.6805555556 +zf-TAZ 0.1566455696 +zf-ZPR1 0.1608910891 +Zn_clus 0.5480769231 +Alfin-like 0.75 +BEL 0.75 +DNC 0.75 +FIE_clipped_for_HMM 0.75 +G2-like_Domain 0.75 +HRT 0.75 +KNOXC 0.75 +LUFS_Domain 0.75 +NF-YB 0.75 +NF-YC 0.75 +NOZZLE 0.75 +PINTOX 0.75 +STER_AP 0.75 +trihelix 0.75 +ULT_Domain 0.75 +VARL 0.75 +VOZ_Domain 0.75 +WOX_HD 0.75 +CXC 0.75 +bZIP_AUREO 0.75 +bZIP_CDD 0.50 +ALOG 0.75 +C2H2-IDD 0.75 +zf-MYST 0.75 +CBP 0.75 +DUF3591 0.75 +LOB2 0.75 +zz-ADA2 0.75 +NLP 0.75 +CRF 0.75 +GIY_YIG 0.75 +ZPR 0.75 +LD 0.75 +NDX 0.75 +SAWADEE 0.75 +C1HDZ 0.75 +C2HDZ 0.75 |
| b |
| diff -r 000000000000 -r 196795831b6a tapscan_domains_v12.txt.gz |
| b |
| Binary file tapscan_domains_v12.txt.gz has changed |
| b |
| diff -r 000000000000 -r 196795831b6a tapscan_rules_v81.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tapscan_rules_v81.txt Wed Feb 14 13:54:16 2024 +0000 |
| b |
| @@ -0,0 +1,298 @@ +ABI3/VP1;AP2;should not +ABI3/VP1;Auxin_resp;should not +ABI3/VP1;B3;should +ABI3/VP1;WRKY;should not +Alfin-like;Alfin-like;should +Alfin-like;Homeobox;should not +Alfin-like;zf-TAZ;should not +Alfin-like;PHD;should not +AP2;AP2;should +AP2;CRF;should not +ARF;Auxin_resp;should +Argonaute;Piwi;should +Argonaute;PAZ;should +ARID;ARID;should +Aux/IAA;AUX_IAA;should +Aux/IAA;Auxin_resp;should not +Aux/IAA;B3;should not +BBR/BPC;GAGA_bind;should +BES1;BES1_N;should +bHLH;HLH;should +bHLH;TCP;should not +bHLH_TCP;TCP;should +bHSH;TF_AP-2;should +BSD domain containing;BSD;should +bZIP1;bZIP_1;should +bZIP1;HLH;should not +bZIP1;Homeobox;should not +bZIP2;bZIP_2;should +bZIP2;HLH;should not +bZIP2;Homeobox;should not +bZIPAUREO;bZIP_AUREO;should +bZIPAUREO;HLH;should not +bZIPAUREO;Homeobox;should not +bZIPCDD;bZIP_CDD;should +bZIPCDD;HLH;should not +bZIPCDD;Homeobox;should not +C2C2_CO-like;CCT;should +C2C2_CO-like;GATA;should not +C2C2_CO-like;tify;should not +C2C2_CO-like;PLATZ;should not +C2C2_CO-like;zf-B_box;should +C2C2_Dof;zf-Dof;should +C2C2_Dof;GATA;should not +C2C2_GATA;GATA;should +C2C2_GATA;tify;should not +C2C2_GATA;zf-Dof;should not +C2C2_YABBY;YABBY;should +C2H2;zf-C2H2;should +C2H2;zf-MIZ;should not +C3H;AP2;should not +C3H;SRF-TF;should not +C3H;MYB-2R;should not +C3H;MYB-3R;should not +C3H;MYB-4R;should not +C3H;zf-C2H2;should not +C3H;zf-CCCH;should +CAMTA;CG-1;should +CAMTA;IQ;should +NF-YA;bZIP_1;should not +NF-YA;bZIP_2;should not +NF-YA;CBFB_NFYA;should +NF-YB;NF-YB;should +NF-YB;NF-YC;should not +NF-YC;NF-YB;should not +NF-YC;NF-YC;should +NF-YC;HMG_box;should not +Coactivator p15;PC4;should +CPP;TCR;should +CSD;CSD;should +CudA;STAT_bind;should +CudA;SH2;should +DBP;DNC;should +DBP;PP2C;should +DDT;DDT;should +DDT;Homeobox;should not +DDT;Alfin-like;should not +Dicer;Piwi;should not +Dicer;DEAD;should +Dicer;Helicase_C;should +Dicer;Ribonuclease_3;should +Dicer;dsrm;should +DUF246 domain containing/O-FucT;O-FucT;should 
+DUF296 domain containing;DUF296;should +DUF547 domain containing;DUF547;should +DUF632 domain containing;DUF632;should +DUF833 domain containing/TANGO2;TANGO2;should +E2F/DP;E2F_TDP;should +EIL;EIN3;should +FHA;FHA;should +GARP_ARR-B_G2;CCT;should not +GARP_ARR-B_G2;G2-like_Domain;should +GARP_ARR-B_G2;Response_reg;should +GARP_ARR-B_Myb;CCT;should not +GARP_ARR-B_Myb;Response_reg;should +GARP_ARR-B_Myb;Myb_DNA-binding;should +GARP_G2-like;G2-like_Domain;should +GARP_G2-like;Response_reg;should not +GARP_G2-like;Myb_DNA-binding;should not +GeBP;DUF573;should +GIF;SSXT;should +GNAT;Acetyltransf_1;should +GNAT;PHD;should not +GRAS;GRAS;should +GRF;QLQ;should +GRF;WRC;should +C3HDZ;Homeobox;should +C3HDZ;START;should +C3HDZ;MEKHLA;should +C4HDZ;Homeobox;should +C4HDZ;START;should +C4HDZ;MEKHLA;should not +HD_PLINC;ZF-HD_dimer;should +HD_WOX;WOX_HD;should +HD_DDT;Homeobox;should +HD_DDT;DDT;should +HD_DDT;WHIM1;should +HD_DDT;WSD;should +HD_PHD;PHD;should +HD_PHD;Homeobox;should +HD_PINTOX;Homeobox;should +HD_PINTOX;PINTOX;should +HD_BEL;Homeobox;should +HD_BEL;BEL;should +HD_KNOX1;Homeobox;should +HD_KNOX1;KNOX1;should +HD_KNOX1;KNOX2;should +HD_KNOX1;KNOXC;should not +HD_KNOX2;Homeobox;should +HD_KNOX2;KNOX1;should +HD_KNOX2;KNOX2;should +HD_KNOX2;KNOXC;should +HD-other;EIN3;should not +HD-other;Homeobox;should +HD-other;bZIP_1;should not +HD-other;WOX_HD;should not +HD-other;PINTOX;should not +HD-other;PHD;should not +HD-other;BEL;should not +HMG;ARID;should not +HMG;HMG_box;should +HMG;YABBY;should not +HRT;HRT;should +HSF;HSF_DNA-bind;should +IWS1;Med26;should +Jumonji_PKDM7;JmjC;should +Jumonji_PKDM7;JmjN;should +Jumonji_PKDM7;zf-C5HC2;should +Jumonji_PKDM7;FYRN;should +Jumonji_PKDM7;FYRC;should +Jumonji_Other;JmjC;should +LFY;FLO_LFY;should +LIM;two_or_more_LIM;should +LUG;LUFS_Domain;should +MADS;SRF-TF;should +MADS;K-box;should not +MADS_MIKC;SRF-TF;should +MADS_MIKC;K-box;should +MBF1;MBF1;should +Med6;Med6;should +Med7;Med7;should +mTERF;mTERF;should 
+MYB-2R;G2-like_Domain;should not +MYB-2R;Response_reg;should not +MYB-2R;trihelix;should not +MYB-2R;MYB-2R;should +MYB-3R;G2-like_Domain;should not +MYB-3R;Response_reg;should not +MYB-3R;trihelix;should not +MYB-3R;MYB-3R;should +MYB-4R;G2-like_Domain;should not +MYB-4R;Response_reg;should not +MYB-4R;trihelix;should not +MYB-4R;MYB-4R;should +MYB-related;ARID;should not +MYB-related;G2-like_Domain;should not +MYB-related;Myb_DNA-binding;should +MYB-related;Response_reg;should not +MYB-related;trihelix;should not +MYB-related;MYB-2R;should not +MYB-related;MYB-3R;should not +MYB-related;MYB-4R;should not +NAC;NAM;should +NZZ;NOZZLE;should +OFP;Ovate;should +PcG_EZ;CXC;should +PcG_EZ;SET;should +PcG_FIE;FIE_clipped_for_HMM;should +PcG_FIE;WD40;should +PcG_VEFS;VEFS-Box;should +PcG_VEFS;zf-C2H2;should not +PcG_MSI;WD40;should +PcG_MSI;CAF1C_H4-bd;should +PcG_MSI;FIE_clipped_for_HMM;should not +PHD;Myb_DNA-binding;should not +PHD;Alfin-like;should not +PHD;ARID;should not +PHD;DDT;should not +PHD;Homeobox;should not +PHD;JmjC;should not +PHD;JmjN;should not +PHD;PHD;should +PHD;SWIB;should not +PHD;zf-TAZ;should not +PHD;zf-MIZ;should not +PHD;zf-CCCH;should not +PHD;HMG_box;should not +PLATZ;PLATZ;should +Pseudo ARR-B;CCT;should +Pseudo ARR-B;Response_reg;should +Pseudo ARR-B;tify;should not +RB;RB_B;should +Rcd1-like;Rcd1;should +Rel;RHD_DNA_bind;should +RF-X;RFX_DNA_binding;should +RRN3;RRN3;should +Runt;Runt;should +S1Fa-like;S1FA;should +SAP;STER_AP;should +SBP;SBP;should +SET;zf-C2H2;should not +SET;TCR;should not +SET;CXC;should not +SET;PHD;should not +SET;Myb_DNA-binding;should not +SET;SET;should +Sigma70-like;Sigma70_r2;should +Sigma70-like;Sigma70_r3;should +Sigma70-like;Sigma70_r4;should +Sin3;PAH;should +Sin3;WRKY;should not +Sir2;SIR2;should +SOH1;Med31;should +SRS;DUF702;should +SWI/SNF_BAF60b;SWIB;should +SWI/SNF_SNF2;AP2;should not +SWI/SNF_SNF2;PHD;should not +SWI/SNF_SNF2;SNF2_N;should +SWI/SNF_SNF2;zf-CCCH;should not 
+SWI/SNF_SNF2;Myb_DNA-binding;should not +SWI/SNF_SNF2;HMG_box;should not +SWI/SNF_SWI3;SWIRM;should +SWI/SNF_SWI3;Myb_DNA-binding;should +TEA;TEA;should +TFb2;Tfb2;should +tify;tify;should +TRAF;BTB;should +TRAF;zf-TAZ;should not +Trihelix;trihelix;should +TUB;Tub;should +ULT;ULT_Domain;should +VARL;VARL;should +VOZ;VOZ_Domain;should +Whirly;Whirly;should +WRKY;WRKY;should +Zinc finger, AN1 and A20 type;zf-AN1;should +Zinc finger, AN1 and A20 type;zf-C2H2;should not +Zinc finger, MIZ type;zf-MIZ;should +Zinc finger, MIZ type;zf-C2H2;should not +Zinc finger, ZPR1;zf-ZPR1;should +Zn_clus;Zn_clus;should +ALOG;ALOG;should +C2H2;C2H2-IDD;should not +C2H2_IDD;C2H2-IDD;should +C2H2_IDD;zf-C2H2;should +MYST;zf-MYST;should +CBP;CBP;should +CBP;zf-TAZ;should +CBP;BTB;should not +TAFII250;DUF3591;should +LOB1;bZIP_1;should not +LOB1;bZIP_2;should not +LOB1;DUF260;should +LOB1;HLH;should not +LOB1;Homeobox;should not +LOB2;LOB2;should +LDL/FLD;SWIRM;should +LDL/FLD;Myb_DNA-binding;should not +ADA2;zz-ADA2;should +ADA2;Myb_DNA-binding;should +RKD;RWP-RK;should +RKD;NLP;should not +NLP;RWP-RK;should +NLP;NLP;should +CRF;CRF;should +CRF;AP2;should +GIY_YIG;GIY_YIG;should +ZPR;ZPR;should +HD-LD;LD;should +HD-NDX;NDX;should +HD-SAWADEE;SAWADEE;should +C1HDZ;C1HDZ;should +C1HDZ;Homeobox;should +C1HDZ;START;should not +C1HDZ;MEKHLA;should not +C2HDZ;C2HDZ;should +C2HDZ;Homeobox;should +C2HDZ;START;should not +C2HDZ;MEKHLA;should not |
| b |
| diff -r 000000000000 -r 196795831b6a tapscan_script_v74.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tapscan_script_v74.pl Wed Feb 14 13:54:16 2024 +0000 |
| [ |
| b'@@ -0,0 +1,916 @@\n+#!/usr/bin/perl\n+use strict;\n+use warnings;\n+\n+# Written by Gerrit Timmerhaus (gerrit.timmerhaus@biologie.uni-freiburg.de).\n+# Changes included by Kristian Ullrich, Per Wilhelmsson and Romy Petroll.\n+\n+# Script to extract all detected domains out of a hmmsearch results file and classify the families of all used proteins based on these domains.\n+# The classification depends on a table which contains all known classification rules for the protein families of interest and on specific coverage values defined for every domain.\n+# The script provides three outputs, namely output.1, output.2 and output.3. The output files are tables in ";"-delimited format.\n+# The structure of output.1 is: "sequence ID ; TAP family ; number of classifications ; domains". \n+# Output.3 shares in principle the same structure as output.1, except that subfamilies are considered. ("sequence ID ; TAP family ; Subfamily ; number of classifications ; domains")\n+# The superior TAP family is specified first, followed by the subfamily. If a TAP family has no subfamily, the TAP family is specified first and then a "-". 
\n+# The structure of output.2 is: "TAP family";"number of detected proteins".\n+# More than one entry for a protein is possible because the classification rules may allow more than one classification.\n+#\n+# The script must be startet with the arguments <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamily classifications file> <"filter" if desired>\n+\n+if (!@ARGV or ($ARGV [0] eq "-h") or ($ARGV [0] eq "-help")) {\n+\tprint "Usage: extract.and.classify.pl <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamily classifications file> <\\"filter\\" (if desired)>\\n\\n";\n+\texit;\n+}\n+\n+# hmmsearch_output: domtblout file\n+my $hmmsearch_output = $ARGV [0];\n+# decision_table: rules file\n+my $decision_table = $ARGV [1];\n+# family_classifications: output.1\n+my $family_classifications = $ARGV [2]; \n+# family_statistics: output.2\n+my $family_statistics = $ARGV [3];\n+# subfamily_classifications: output.3\n+my $subfamily_classifications = $ARGV [4];\n+# domspec_cuts: coverage values file\n+my $domspec_cuts = $ARGV [5];\n+# gene_model_filte: filter for ARATH and ORYSA\n+my $gene_model_filter = $ARGV [6];\n+\n+if ($family_statistics eq "") {\n+\tprint "Usage: extract.and.classify.pl <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamil classifications file> <\\"filter\\" (if desired)>\\n\\n";\n+\texit;\n+}\n+\n+if ($gene_model_filter and $gene_model_filter eq "filter") {\n+\tprint "\\nGene model filter is activated. 
It only works for TAIR (Arabidopsis) and TIGR (Rice) proteins up to now\\n";\n+}\n+\n+# Array where the $hmmsearch-output/domtblout file will be stored\n+my @output = ();\n+# Array with domain-specific coverage values\n+my @cuts = ();\n+# Array with rules\n+my @dec_table = ();\n+# Counter for the number of detected domains in the hmmsearch output file\n+my $entry_counter = 0;\n+# Containes the actual result for a query sequence\n+my $akt_entry = "";\n+# Used to define query entry to ignore similar domains\n+my $whole_entry = ""; \n+# Includes the final entries after ignoring similar domains\n+my @results_of_extraction = ();\n+# Used to define query entry to ignore similar domains\n+my $extracted_domain = "";\n+# Used to define query entry to ignore similar domains\n+my $present = "";\n+# Used to define query entry to ignore similar domains\n+my $protein = ""; \n+\n+my $lek = "";\n+\n+############################################\n+### 1. Read in the hmmsearch output file ###\n+############################################\n+\n+print "\\n*** reading in $hmmsearch_output ***\\n\\n";\n+\n+@output = get_file_data("$hmmsearch_output");\n+\n+print "*** Parsing $hmmsearch_output ***\\n\\n";\n+\n+# If wrong format exit the program, ninth row from the end\n+if ($output [-9] !~ /^# Program: hmms'..b'ilies into $family_list to create output.2\n+foreach my $fcf_line (@family_classifications_file) {\n+\t$fcf_line =~ /^[^;]+;([^;]+)/;\n+\t#push @family_list, "$1";\n+\tprint FAMILY_CLASSIFICATIONS "$fcf_line";\n+}\n+close (FAMILY_CLASSIFICATIONS);\n+\n+foreach my $fcf_line (@subfamily_classifications_file) {\n+\t$fcf_line =~ /^[^;]+;([^;]+);([^;]+)/;\n+\tif ($2 eq "-") {\n+\t\tpush @family_list, "$1";\n+\t}\n+\telse {\n+\t\tpush @family_list, "$2";\n+\t}\n+\tprint SUBFAMILY_CLASSIFICATIONS "$fcf_line";\n+} \n+\n+close (SUBFAMILY_CLASSIFICATIONS);\n+\n+print "*** calculating the family statistics and write it in $family_statistics 
***\\n\\n";\n+\n+##################################\n+### 5. Create the output files ###\n+##################################\n+\n+my $statistics_outputfile = "$family_statistics";\n+\n+unless (open(FAMILY_STATISTICS, ">$statistics_outputfile")) {\n+\tprint "Cannot open file \\"$statistics_outputfile\\" to write to!!\\n\\n";\n+\texit;\n+}\n+\n+# Count the family entries\n+my @output_family_statistics = ();\n+my @gefundene_familien = ();\n+my $family_counter = 1;\n+\n+shift @family_list;\n+@family_list = sort @family_list;\n+\n+\n+my $old_family = "";\n+push @family_list, \'BAD FIX\'; # Makes to loop go through every fam and stops at non fam(BAD FIX).\n+\n+foreach my $line (@family_list) {\n+\tif ($line eq $old_family) {\n+\t\t$family_counter++;\n+\t}\n+\telsif ($old_family ne "") {\n+\t\tpush (@output_family_statistics,"$old_family;$family_counter\\n");\n+\t\t# Add all found families to a list of found families\n+\t\tpush (@gefundene_familien,"$old_family");\n+\t $family_counter=1;\n+\t}\n+\t$old_family = $line;\n+}\n+\n+my %hash = ();\n+\n+# Put all families from the classifictaion ruled and all found families in a hash\n+foreach my $element (@gefundene_familien,@liste_alle_familien) {$hash{$element}++;}\n+\n+# Remove merged families\n+delete $hash{\'GARP_ARR-B_Myb\'};\n+delete $hash{\'GARP_ARR-B_G2\'};\n+delete $hash{\'bZIP1\'};\n+delete $hash{\'bZIP2\'};\n+delete $hash{\'bZIPAUREO\'};\n+delete $hash{\'bZIPCDD\'};\n+delete $hash{\'HRT\'};\n+delete $hash{\'GIY_YIG\'};\n+\n+# Add all not found families from the classification rules to @output_family_statistics \n+# With zero as number of families found \n+\n+foreach my $element (keys %hash) {\n+\tif (($hash{$element} == 1) and ( $element eq "0_no_family_found")) {\n+\tpush (@output_family_statistics,"$element;$unclassified_families\\n");\n+\t}\n+\tif (($hash{$element} == 1) and ($element ne "0_no_family_found")) {\n+\tpush (@output_family_statistics,"$element;0\\n");\n+\t}\n+}\n+\n+# Sort @output_family_statistics 
caseinsensitive-alphabetically \n+my @sortierte_statistik = sort {lc $a cmp lc $b} @output_family_statistics;\n+\n+# Print headline to @sortierte_statistik\n+unshift (@sortierte_statistik,"family statistics for $hmmsearch_output\\n");\n+\n+print FAMILY_STATISTICS @sortierte_statistik;\n+\n+# Print FAMILY_STATISTICS "$old_family;$family_counter\\n";\n+\n+close (FAMILY_STATISTICS);\n+\n+#################################################\n+### 6. Give out some statistical informations ###\n+#################################################\n+\n+$entry_counter [0] = 0;\n+my $sum = 0;\n+foreach my $entry (@entry_counter) {\n+\t$sum += $entry;\n+}\n+print "$classified_families classifications were found for $sum proteins.\\n";\n+print "This classifications are divided in:\\n";\n+my $count = 0;\n+foreach my $element (@entry_counter) {\n+\tif ($count != 0) {\n+\t\tprint "$element proteins were classified for $count";\n+\t\tif ($count == 1) {print " family\\n";}\n+\t\telse {print " different families\\n";}\n+\t}\n+\t$count++;\n+}\n+print "\\n$unclassified_families proteins could not be classified\\n\\n";\n+\n+print "*** The results were written in $family_classifications and $subfamily_classifications ***\\n";\n+print "*** done ***\\n\\n";\n+\n+exit;\n+\n+sub get_file_data {\n+\t\n+\tmy ($filename) = @_;\n+\n+\tuse strict;\n+\tuse warnings;\n+\n+\tmy @filedata = ();\n+\n+\tunless( open(GET_FILE_DATA, $filename)) {\n+\t\tprint STDERR "Cannot open file \\"$filename\\"n\\n";\n+\t\texit;\n+\t}\n+\n+\t@filedata = <GET_FILE_DATA>;\n+\n+\tclose GET_FILE_DATA;\n+\n+\treturn @filedata;\n+}\n+\n' |
| b |
| diff -r 000000000000 -r 196795831b6a test-data/PUBLIC_Ectocarpus-sp7_proteins_head.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/PUBLIC_Ectocarpus-sp7_proteins_head.fa Wed Feb 14 13:54:16 2024 +0000 |
| b |
| b'@@ -0,0 +1,32 @@\n+>Ec-00_001160.1 Forkhead-associated (FHA) domain (809) ;mRNA; f:1452084-1459465\n+MEPPPPVPAPIIVAGTRAKETGVDGNSSAATAGSTAAVSPPLDKAQQSALDPAKDGSASLPPPPPKTAGSLNGDGVKAVASGGYKPPSWGLTEAPGASGLSLTVLKGGVEVGSISLDNRTHVLLGRQQGVVDVLLEHPSISRKHAILQHGQNGALFLFDNGSTHGCSVNKKKIPPKEFHRLHVGDVIKFGESTRLYALEGPEELRPAEYESDNLRNLRLDAGRKQLAAKLAKIKAGGAGEGGGKGGGDSGEYGISWGFDEDAVAEEEDEDGDGAERDEDEVELPDYLKTEAQKRRRRDTKIGLTEDNVHKRDAKLFEKLQLKLTKIEEIEETIRSKNKARERGKEGGEGGSGGEEGGRKAGRGTEDGEEDDDYYDRTAPVVPTSSGTSKSDLASKKAEIKARRFGARDKTKKLSAVTPPADDTAAAERKRGVGPGQNEAAAQSLEALTRRGEAVVEDLERTQAGLAELEAEEAGEAALAEDGGVAADPLDMFMTENRRKERQQAIVRLTAKREALREEQALLKVMVEAARPSMPTLKKSPAPAAATASVAATKEETVPVPETTTRGSGSSSSDQAAEPDDRKDTRAAAVDREHMPGGGGYGEAMPGSTGSGSHEKYGDEVEKTAPSRAVLAPEAASTLGSMPSPVAPPCRSNAIPEARKSPAGTVASAGTRERGVEKGQAETRHPGVKEGKEASGTKKRGTPVGTSMLPPPPSKRQQRRAENSKAGLAGENDDDPVEPKAKRTVKGPAMPPPLGKPSKVGGGVSVPTAVARVGEKVAGKEALEGGDVDWVPPKDALEKMAALNRKFGY*\n+>Ec-00_001310.1 SET domain protein (668) ;mRNA; r:1563326-1571582\n+MAIPTSKDEDLLDDEQAPAVAAAQDQELVAGSDAGTVAPSKKKKKKKKKKKSPQQNNQLAQVQVYETIKDLDTFKVTEDSVSGRCVIASRDLKAGELVLREPPFVKVVRRDCASRQCAYCCQQVTERGKIEADVPFAVYCSRACQAREDALRAAEASALGKLAGISAARDVDIDLLRMLLRLLITRAKALGLREPSGDSDSVSRGVDEEGEDGTMGEGLFLRQQWENLYALMHHREAMAPDWISVVREAGEDLLQLLPEWVRFDVEEVVQLACRVNVNAHGLRDDSGANLVIGVGMFPLTAMINHACRPNCTFVYFGGNLEVRTLEPVSAGAELSVYYIDLLQSTAARRQELLTSKHFLCKCSRCENPSSMDDYLDGVCCTDCGERGCLTPTPPPSAEDILAAQLAQLGEGLADESAANGSGKGMGKKKGGGGGSTTSRRETSVNPSAALGAKDGSGNGSRAGGVRGGSESVVQKVYCSACGREYPGTAVEESVARAKALWDAAMAVVRAKSFSLARKSLEKWLQDYDAGAVLPPTATKKKSVKKDRLKLHPANVMVVQTLVPLSNCCTFEEDHAASARHLRRAVSAMEAVYPANFPELGDFHAALADANDALLQKRGQTLPKKSRSQAVSERKQALERAATIRSVCLGKDHPATREAARALDRVTG*\n+>Ec-00_001360.1 aureochrome 2 (442) ;mRNA; 
r:1593684-1598236\n+MPASVKPPVFTSMVHRKVQHHTSWQDADFQPDDLGLDLTDLSTMTGFLMNEVPDNTGHFYPPWANELSPLVKDEPSAFMMPDPAAPRQPQPRQQQQQQDQRLPAPEGLPAAPVADPALDVIMGGATGSSRPGSTTSSSSSGASSMLRAPKAAAAAGAAALGGVGSTFRKTPAGGVTRRRSSSKEEQAKKRRERNRVLARRTRLRKKFFFQSLQQQVARLQRENERLKGIVTTRCPDSVGEILMSCRSKMPSMVADCAGQATAVLDQSGFLLVKALQSSQPSFCVTDPQMPDNPIVYASDTFIELTGYDRAQVLGRNCRFLQGPDTDPDAVAKIRKGIEEGSDTSVYLRQYKADGTVFWNHVFVAALRNSEHKIINYVGIQHPLDKEPSPEVVACINNGKEQEIMSVQEEDRPAGWGGQWPEDVNGDLATLDHLMAGGWGTD*\n+>Ec-00_001730.1 WD40 repeat (1089) ;mRNA; r:1948051-1960039\n+MSGYSRAGVGVGGGGGGGGGGGGGGGGGGGGANSNRGASASRGPGGALQGAHMAGTVATVAAAGGQQSKSVAVFRSLELCDLLKLEIGNITSEMGQHLEEREEYEKKFRQQLAEMDRIQHSLKQLQEAHMVMKQQYEDEIVRLRHQLESSHPLKSDPDSRGGGAGSSVQGVSGGMSMGSTGPSQSHSSGPPHKGLGGMPVGGPHGLGSPRKQPGQLEPGAASLRGPMLTAPGAQLVGGGGMGAGSGVVLPGMAMRGRGHGEDDYGGGRGQGPNGQTLEPLSKRPRLADLQGPPGPLPPHVLHPPHGFSQSQHQQHPQQHQQHPPPHHKGGFFGGRPGAPPAAVAGGGGGGGGGGDNGGRRPGGPEGQYRWPGGSGPGGGGGGGDSGGAGGGGGAGVPAMGPGKQRQLGPGGGGAPPRELSSSTGKGGGAGGGGGGNGNGMVEIPDAIPAELSYQVRYEAEEGEGGRAAAAKDGLAVELAKSQDLRSVVCCVRFSTDGTKIAAGSHSCVKVFDVNSFKQLYVCRKQVQEEQGAPQTADGGDPYVRAVCFSPDGLSIVAGMEKNSAKVLVLEEEGGRQGAITLSGHESEVYSLDWVSDMIASGSGDGRIRLWDSVTGACKASLGDMGGPQDGVTSVVLRQDTTMVAAASIDRVVHVWSTQTHKILHRLDGHSESVYAITFSADGNRLVSGGLDKTIKVWDLGPGSEGRLSPQAKTLPGGHKDYVLSVCFSQDGKYIISGSKDRSVTMWDARLMKRVATITGFKNSVIGVSASPFNSMFATGSGDNLVCVWNYGDRDDYRRQSESAVAPASSGRSSSAEKPRPPSRSSRSASPPISNASSLRKDDRASSPSPPRASPGRGGRGGDGDGGSSRGRGGVGLVEKGRGGSAGAGAGATNGKGRTSPSSDEREGRTAAGSEERKRSGSKESSRTGPRGGGGGHDSDRKQQRHESKHRQGRAGSPSDGRRSSSNSKERSSGRGGGVEPMDEDDDTEEEKQGEEAEGGGGKRAGNTDETPLGRKRGSGGVGNGSGGRKALNRSSSSSSSSSGSGGGGGGGGGGGGGGGRGPSPEKRGKAALPPKKELQRRDSPSSAE*\n+>Ec-00_010160.1 Phytochrome-like protein (978) ;mRNA; 
f:17223525-17226527\n+MDPHTHTNERIHMRSSGGCDGDMRTASTHIQSCGCAFAIEETADDMYPSGLRILGVSQNAVEAPWAYASSVSDLLGKDLGHLLRVECVRTVRSLVTRYAQACKPSHEDDDHISPKANRITADACPSPRIRGELRPGAASGAEGDIASFTVTGSNPGVYLVDVERHGSDCARVEHTPGLLLLGDLLESIPVGSNPVESTAALCDALAKSMPAYDRVMVYRFAPDGSGQDDGSGEVVHESVRAGADIGSSYLNLRFPALDIPPIARKLFKLVGVRFIADTSAPAVPMITLHDQASSPLDLFRSALRAPAECHLRYLRNMGVKASLVVSIAVDGGTWGLFSFHSYTRTVHPSCEERLSVEMAASVVSSLISRYQREEIAATALSLSRTLGNLGNYTRVNDFLSADHHSLLGILDVDAVILCEHLRSVTLYGKKDITLSLEECQELRNGDGDESSEMAISFRTLGARGVAFFWVRSFFVAFLRGSIANSVKWAGNPDAPVNKDEVMTPRASFELFMRTSGARCKAWSPLTVDLLNMVRQGFSSQLYAEALPADLQETFARVSHELRTPFHGVMGALEILEAGNGIMGAEEQLDVIRSALRCGGSMMSTLNDILEIAKDRNNTE'..b'DAADASTTSGNIRGRRRAAAARTSHRMVASSRNWEWGRAKSSGDAPTPAAQAFGSDQELKLGVLLLNLGGPERPEDVQPFLFNLFADPDIIRLPKLVQWLQNPIAAVLAARRAPQSKSAYESIGGGSPIVSWTNAQAKGIASQLEAKGLSGTKCYVGMRYWHPFTEAALEAVEDDEINALVILPLYPQFSISTSGSSLRILNEEFTRRPEQWGHKNVVHTVVPSYHDRPGYVNAMASLIAREVAEYTPEQRMQGVQVLFSAHGVPKSYIDAGDPYKAQIESCVKLISEKVDGINAEGGPGAKPGSSGAAAGGVTYHLSYQSRVGPVEWLQPYTDAKIHELADNGCKNLVVVPVSFVSEHIETLEEIDMEYREVAEEAGITNWRRVPALNTDPAFIEDMADMVVEALALPTLTVSEAFTRNNCDRKEAEGFLEKALDGMYGMPKTGGKPPKSGKVGGAGAAGANASSGGGGGADGEGSGDKRKEAARVLSTLSGAAFAADGVGREIAGLFTATSDGIFF*\n+>Ec-01_005370.1 Hypothetical protein (68) ;mRNA; r:4610968-4611171\n+MAHGVLLSSTTGWWRNLQVWCPCCCIFQFEPVGFYLHAKRGPSRLQTISLVGQRIMIFADPKQLESA*\n+>Ec-01_005380.1 Aminoalcohol phosphotransferase (396) ;mRNA; r:4612890-4621264\n+MSDFRGGSVVTPRAVKYLRRYQYHGSDRSLLYKYVLSPLAETCLVFLPSWMAPNLVTTIGLGLTTASYLLLYLSMPGLVSNESTPWWVFPAAAAGLIVYQTLDNMDGKQARRTGSSSPLGLIFDHGCDAINCCFGVVFVSCILDAGSSLPLLAAIVLNQLVPFFFTTWEHYYTHELILPIVNGPSEGVVLGAVSACLRGVYGPEFFSAPREGLRGWPLGEVLMACSLLGVALTVFKQIVLVARARRLAGRGMVNPIRDASWFVVLMVLGGSWASVRPELFLTKPYTMVLLFGLLHVDMAVHLMVCHVCNMVCRSFRPILIPFMLVAANSFFPGGPLLGERTLVLGFTVLTFIYEALYLYLVVTETSLALDIYVFKLGKRSDATRNGGGTTTSKKD*\n+>Ec-01_005390.1 WD40 repeat (407) ;mRNA; 
f:4621750-4629304\n+MSESKDGGGSSGGGGRAPYLDVSEDKMYDSRLRGDYELHDYAHSPSNQRLKEQRESKGGGSIADRVLGARLHPGLTISENSAEVFVTRFAPEGNLLAAACGDGTIRIFHVSTGRLAYNLQSSSSQSLPTTSLRFRPAIAQSKTRNVLVSCNAGGEIQHWHITSGKCLSTIKEEDQFYALDYRKDGSVFAATGKNHTVHIYDETTKHEISLLQGGSGYGSSSAPGHSNRVFAVKFHPEDPEVLLTAGWDNTVQFWDMRVGHSVRSIFGPHISGDSLDICGNEILTGSWRPNDPLEIWDYGTAELKETIPWNRSSAQGVQPTLLYTAQFSNTADGGRYIAAGGSGANEAKIFDHTANNQLVGTVTGLPRGVFTVDFSPSPDAKKVAVAGGDASIRIIDIVEEKTADVY*\n+>Ec-01_005400.1 hypothetical protein (1653) ;mRNA; r:4630582-4637601\n+MSYVERLREELMGSLGFEVESQSCLSGNRGSAGERDQKARRRRSGGGAIAAVQVGVPYPHALRPRQIQQQQQQQQQQQGRRAADDSSCMVGGGCLRPQRADLSRVPRSRRRPASASSVRYLANKSNMNDGYSGANKDMDVLLKICRDQGRANSLNPADFDSSISAIANLPVKQFMTAAKDGWGPYFRNLHSNMRKETTARSTTTAKRMNRPPPERRRRSSQQQHPPLPLPRRASSGATKDEKKNNRRYSKVANDLHHMVRSADVASSDSNNRRRRRQSEDDNSDGLAASRARAAKRQAEEMVIRRMGVRLQALWQELKIPDPDRAYVTAAYLDAGGGGGGFSGQRARGGGGIEEPPVGSGGGGGGNSGPTSENVNRELTRQIRLLLEHRAATVKVLRCVNARESRLCEVEQALQAFQWHRGLDTDALGVVVSLAGLRDASLDVVRAVEEWRTNLWQPRAFCWRGVNYLEKMWQDTIFLRSPVGQSLLASVGLEAKDMTFVLYPSGNGGGAPSAIPGDSSTCGSELSDGERDTKGDVLRPPPAIVRLTPKAELVSAYRARYFEQSPVVGAGGRPGGESGDVGGERGRLKSTAAAATAAMEAVVVQEAALQRRVEAERVRLATKGCFVPLLRWLPGDSRNPAPKGKRRWSSSSPPSVPPAARNPPPGAESETHAAKDVVDSFAHNDGRSGVPAVRDVSVSAAIPPDPLTGGELPLQLQQQQQQQQQQQQPATVTNTSTSAGASLFRGSVRDSAQQERLTVSGEGDGHEEEDREDEDGDEGEDREQDEKGQASLLHTEGVRSQKQPERVEEGGHDQTQSQSEGPQAGQGSTAEVEGRSAFHENGVGEQTKMTNPLAPSFRLETSLHPPGAEEESRSETDIARRSACTVGAEARAGENGCLKVVATSECGVGSAGIPVGGVSSGDDGDGDTDKPSIVPGELRQEGIVDEERSVAPSVCYDDDFEEDAEGYSLSDSETSGEGGSADFGECLNGSISSDDAKDARRDQGVAHTTEEEGAAISCGEGRQNPVVGGNNVLTEHQRQQQLVEERETVAATAALRIQRSARAWLCRGGRLRNTKPDEKEAPQESAKEGKQEWLDAWRARRHSQVRQAQRPEEIPEEAAIRIQTTVRGGLASRQVEELRRELHARKMAATLKIQAFVRRRRARRNTTAINRREISENVDLRQQQEAEPDPELAGVGVCRGTARQGEEEILAVSSEFDSAEAATRSDRSARTIQKAVRRRLSSGNTTSKADGLPAMEIKGQPKTLEEEAQGQPDAGKRGDGPGAVLVNRCGSGDGSFIPSQQAPVVAAESSSPVVRAGDELDSPSSPAAVDGTPSSTPASGADGLDARLSDADGKNNRPDVAESAFVEAGLPSTSLPSEDSAESETADDAAGGDLSQTLQHPREGSNFQPEGASSWSPLPSPTTPRPEAAERGQQENDHQPRQVEGGGEAAVSLPPSSATACPPTLGPTVDVFVGLLPVDTDTSADAMMTTVGGTHAGGGA
ESLGFSAVGLSPVPERQALVAVAASASEYGSEDPDFDALSSSPGGSSRGGVSRGEEENAGVAARVATRSSVVTTGEKVEEPTESRGVERVAAPVAAAVAGAATEKNLAGDIEPVSLAAAEVATTKEVERGGTQCTEDGSDNSSIVSLSVSSGS*\n+>Ec-01_005410.1 Hypothetical protein (154) ;mRNA; f:4637945-4638913\n+MGSAWSRKQATIHSSFQAAPLAVASWDLQHVKILHDRFRDDGYEFGIDRVSLRDLISSALPLARGVTGDLWEIYVEHEQEMLYPLELFAAVALQCHGLSRCGVSTIYSCPVFLCKNCKNPSYFHAHFGPCRSEGRRLDWCISSKASPGIRVHF*\n+>Ec-01_005410.2 Hypothetical protein (200) ;mRNA; f:4641160-4641759\n+MATGDSKTEFSIPTPIPVNPCNGTCSLSSPVDASCKIVVTQHLYAVHAAIVLRGNSKQLKTRDLVRICARFDGKPVATLPPQSFHAHSSPAHCVVRLKKAGALHNRTAVPPSCRWPLATPLVYLFDQGSNHRHLRRQQHRSKRLLVCIAPTTIKPARPTTVVPSLRQPPPRLCPAPRPAPLVPATHHPRAIHQQRGRRC*\n+>Ec-01_005410.3 Hypothetical protein (102) ;mRNA; f:4639028-4639623\n+MRGALLKLGVYRWLSVQRKRENSTKTRPQPTDSVCAVEVYYRVGVSFRAPRLLRSQPTEKRRGLCSTCLTLKADKRFRTTSWPSASRPSWEHFRRSLDPAV*\n' |
| b |
| diff -r 000000000000 -r 196795831b6a test-data/output.1.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output.1.tsv Wed Feb 14 13:54:16 2024 +0000 |
| b |
| @@ -0,0 +1,10 @@ +sequence ID TAP family number of classifications domains +Ec-00_001160.1 FHA 1 FHA; +Ec-00_001310.1 SET 1 SET; +Ec-00_001360.1 bZIP 2 bZIP_AUREO;bZIP_2;bZIP_CDD; +Ec-00_001700.1 RKD 1 RWP-RK; +Ec-00_001730.1 0_no_family_found 0 WD40; +Ec-00_010160.1 0_no_family_found 0 Response_reg; +Ec-01_004960.2 0_no_family_found 0 K-box; +Ec-01_005390.1 0_no_family_found 0 WD40; +Ec-01_005970.1 GNAT 1 Acetyltransf_1; |
| b |
| diff -r 000000000000 -r 196795831b6a test-data/output.2.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output.2.tsv Wed Feb 14 13:54:16 2024 +0000 |
| b |
| @@ -0,0 +1,139 @@ +TAP family number of detected proteins +0_no_family_found 4 +ABI3/VP1 0 +ADA2 0 +Alfin-like 0 +ALOG 0 +AP2 0 +ARF 0 +Argonaute 0 +ARID 0 +Aux/IAA 0 +BBR/BPC 0 +BES1 0 +bHLH 0 +bHLH_TCP 0 +bHSH 0 +BSD domain containing 0 +bZIP 1 +C1HDZ 0 +C2C2_CO-like 0 +C2C2_Dof 0 +C2C2_GATA 0 +C2C2_YABBY 0 +C2H2 0 +C2H2_IDD 0 +C2HDZ 0 +C3H 0 +C3HDZ 0 +C4HDZ 0 +CAMTA 0 +CBP 0 +Coactivator p15 0 +CPP 0 +CRF 0 +CSD 0 +CudA 0 +DBP 0 +DDT 0 +Dicer 0 +DUF246 domain containing/O-FucT 0 +DUF296 domain containing 0 +DUF547 domain containing 0 +DUF632 domain containing 0 +DUF833 domain containing/TANGO2 0 +E2F/DP 0 +EIL 0 +ET 0 +FHA 1 +GARP_ARR-B 0 +GARP_G2-like 0 +GeBP 0 +GIF 0 +GNAT 1 +GRAS 0 +GRF 0 +HD-LD 0 +HD-NDX 0 +HD-other 0 +HD-SAWADEE 0 +HD_BEL 0 +HD_DDT 0 +HD_KNOX1 0 +HD_KNOX2 0 +HD_PHD 0 +HD_PINTOX 0 +HD_PLINC 0 +HD_WOX 0 +HMG 0 +HSF 0 +IWS1 0 +Jumonji_Other 0 +Jumonji_PKDM7 0 +LDL/FLD 0 +LFY 0 +LIM 0 +LOB1 0 +LOB2 0 +LUG 0 +MADS 0 +MADS_MIKC 0 +MBF1 0 +Med6 0 +Med7 0 +mTERF 0 +MYB-2R 0 +MYB-3R 0 +MYB-4R 0 +MYB-related 0 +MYST 0 +NAC 0 +NF-YA 0 +NF-YB 0 +NF-YC 0 +NLP 0 +NZZ 0 +OFP 0 +PcG_EZ 0 +PcG_FIE 0 +PcG_MSI 0 +PcG_VEFS 0 +PHD 0 +PLATZ 0 +Pseudo ARR-B 0 +RB 0 +Rcd1-like 0 +Rel 0 +RF-X 0 +RKD 1 +RRN3 0 +Runt 0 +S1Fa-like 0 +SAP 0 +SBP 0 +SET 1 +Sigma70-like 0 +Sin3 0 +Sir2 0 +SOH1 0 +SRS 0 +SWI/SNF_BAF60b 0 +SWI/SNF_SNF2 0 +SWI/SNF_SWI3 0 +TAFII250 0 +TEA 0 +TFb2 0 +tify 0 +TRAF 0 +Trihelix 0 +TUB 0 +ULT 0 +VARL 0 +VOZ 0 +Whirly 0 +WRKY 0 +Zinc finger, AN1 and A20 type 0 +Zinc finger, MIZ type 0 +Zinc finger, ZPR1 0 +Zn_clus 0 +ZPR 0 |
| b |
| diff -r 000000000000 -r 196795831b6a test-data/output.3.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output.3.tsv Wed Feb 14 13:54:16 2024 +0000 |
| b |
| @@ -0,0 +1,10 @@ +sequence ID TAP family subfamily number of classifications domains +Ec-00_001160.1 FHA - 1 FHA; +Ec-00_001310.1 SET - 1 SET; +Ec-00_001360.1 bZIP - 2 bZIP_AUREO;bZIP_2;bZIP_CDD; +Ec-00_001700.1 RWP-RK RKD 1 RWP-RK; +Ec-00_001730.1 0_no_family_found - 0 WD40; +Ec-00_010160.1 0_no_family_found - 0 Response_reg; +Ec-01_004960.2 0_no_family_found - 0 K-box; +Ec-01_005390.1 0_no_family_found - 0 WD40; +Ec-01_005970.1 HAT GNAT 1 Acetyltransf_1; |
| b |
| diff -r 000000000000 -r 196795831b6a test-data/output.domtbl.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output.domtbl.tsv Wed Feb 14 13:54:16 2024 +0000 |
| [ |
| @@ -0,0 +1,28 @@ +# --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord +# target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target +#------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- --------------------- +Ec-01_005970.1 - 608 Acetyltransf_1 PF00583.26 117 1.6e-14 44.3 0.0 1 1 1.8e-15 2.8e-14 43.5 0.0 31 117 348 434 305 434 0.66 Acetyltransferase (GNAT) domain protein (608) ;mRNA; f:5180258-5197307 +Ec-00_001160.1 - 809 FHA PF00498.27 69 2.3e-19 59.7 0.2 1 1 2.4e-20 3.8e-19 59.0 0.2 2 69 123 190 122 190 0.96 Forkhead-associated (FHA) domain (809) ;mRNA; f:1452084-1459465 +Ec-01_004960.2 - 797 K-box PF01486.18 93 3.1e-06 17.5 4.8 1 1 3.7e-07 6e-06 16.6 4.8 11 85 683 754 674 756 0.81 Ankyrin repeat-containing domain (797) ;mRNA; f:4270310-4278760 +Ec-00_010160.1 - 978 Response_reg PF00072.25 112 1.2e-10 31.7 0.0 1 1 1.6e-11 2.6e-10 30.7 0.0 13 106 845 952 840 958 0.74 Phytochrome-like protein (978) ;mRNA; f:17223525-17226527 +Ec-00_001700.1 - 579 RWP-RK PF02042.16 49 2.1e-19 59.5 0.1 1 1 2.3e-20 3.7e-19 58.7 0.1 7 46 111 150 106 153 0.95 NIN-like transcription factor (579) ;mRNA; r:1923852-1928158 +Ec-00_001310.1 - 668 SET PF00856.29 169 4.1e-18 56.4 0.0 1 1 1.2e-18 2e-17 54.2 0.0 1 168 83 338 83 339 0.72 SET domain protein (668) ;mRNA; r:1563326-1571582 +Ec-00_001730.1 - 1089 WD40 PF00400.33 38 1.9e-38 120.1 18.0 1 4 3.9e-09 3.1e-08 24.5 0.2 5 38 579 611 575 611 0.86 WD40 repeat (1089) ;mRNA; r:1948051-1960039 +Ec-00_001730.1 - 1089 WD40 PF00400.33 38 1.9e-38 120.1 18.0 2 4 1e-11 8.1e-11 32.7 0.3 3 38 662 698 660 698 0.93 WD40 repeat (1089) ;mRNA; r:1948051-1960039 +Ec-00_001730.1 - 1089 WD40 PF00400.33 38 1.9e-38 120.1 18.0 3 4 9.6e-11 7.7e-10 29.6 0.1 8 38 716 747 709 747 0.88 WD40 repeat (1089) 
;mRNA; r:1948051-1960039 +Ec-00_001730.1 - 1089 WD40 PF00400.33 38 1.9e-38 120.1 18.0 4 4 1.4e-05 0.00011 13.3 0.0 4 38 754 789 751 789 0.90 WD40 repeat (1089) ;mRNA; r:1948051-1960039 +Ec-01_005390.1 - 407 WD40 PF00400.33 38 3.8e-17 52.7 3.9 1 2 7.3e-06 5.8e-05 14.1 0.0 17 37 88 108 72 109 0.77 WD40 repeat (407) ;mRNA; f:4621750-4629304 +Ec-01_005390.1 - 407 WD40 PF00400.33 38 3.8e-17 52.7 3.9 2 2 1.8e-07 1.4e-06 19.2 0.1 8 38 223 255 217 255 0.89 WD40 repeat (407) ;mRNA; f:4621750-4629304 +Ec-00_001360.1 - 442 bZIP_2 - 55 9.2e-17 51.1 12.5 1 1 1e-17 1.6e-16 50.3 12.5 2 49 183 230 182 233 0.92 aureochrome 2 (442) ;mRNA; r:1593684-1598236 +Ec-00_001360.1 - 442 bZIP_CDD - 52 4.1e-13 39.4 7.8 1 1 4.4e-14 7.1e-13 38.6 7.8 2 43 188 229 187 232 0.94 aureochrome 2 (442) ;mRNA; r:1593684-1598236 +Ec-00_001360.1 - 442 bZIP_AUREO - 52 2.2e-22 69.4 6.2 1 1 2.7e-23 4.4e-22 68.5 6.2 1 51 187 237 187 238 0.97 aureochrome 2 (442) ;mRNA; r:1593684-1598236 +# +# Program: hmmsearch +# Version: 3.3.2 (Nov 2020) +# Pipeline mode: SEARCH +# Query file: /home/saskia/code/github/galaxyproject/tools-iuc/tools/tapscan/tapscan_domains_v12.txt +# Target file: /tmp/saskia/tmp4n93kzh1/files/b/7/b/dataset_b7b39fd6-41f1-440d-bfb7-da7a3a9f070e.dat +# Option settings: hmmsearch --domtblout domtblout.txt --cut_ga /home/saskia/code/github/galaxyproject/tools-iuc/tools/tapscan/tapscan_domains_v12.txt /tmp/saskia/tmp4n93kzh1/files/b/7/b/dataset_b7b39fd6-41f1-440d-bfb7-da7a3a9f070e.dat +# Current dir: /tmp/saskia/tmp4n93kzh1/job_working_directory/000/4/working +# Date: Mon Nov 13 14:57:28 2023 +# [ok] |