Repository 'agame_custom_tools'
hg clone https://toolshed.g2.bx.psu.edu/repos/matteoc/agame_custom_tools

Changeset 0:68a3648c7d91 (2016-12-22)
Commit message:
Uploaded
added:
annotate_ends/annotate_ends.xml
annotate_ends/attach.tags.pl
de_prokka/de_prokka.pl
de_prokka/de_prokka.xml
de_prokka/fas
de_prokka/p1
de_prokka/tbl
filter_Fb/filter.fb.pl
filter_Fb/filter.fb.xml
fosm_cluster/4mers.list
fosm_cluster/clusterF.xml
fosm_cluster/compute.stats.pl
fosm_cluster/kmeans.R
fosm_cluster/res.fasta
fosm_cluster/sim1_galaxy.fasta
fosm_cluster/sim1_galaxy.fasta.stats
fosm_cluster/test
mytrimmer/aaa.fplot
mytrimmer/aaa.gp
mytrimmer/aaa.png
mytrimmer/aaa.rplot
mytrimmer/f1
mytrimmer/f2
mytrimmer/out.fplot
mytrimmer/out.gp
mytrimmer/out.png
mytrimmer/out.rplot
mytrimmer/test.delta
mytrimmer/trim.seqs.C.cpp
mytrimmer/trimPE
mytrimmer/trimPE.xml
pfamScan/Bio/Pfam/Active_site/as_search.pm
pfamScan/Bio/Pfam/HMM/HMM.pm
pfamScan/Bio/Pfam/HMM/HMMIO.pm
pfamScan/Bio/Pfam/HMM/HMMMatch.pm
pfamScan/Bio/Pfam/HMM/HMMResults.pm
pfamScan/Bio/Pfam/HMM/HMMResultsIO.pm
pfamScan/Bio/Pfam/HMM/HMMSequence.pm
pfamScan/Bio/Pfam/HMM/HMMUnit.pm
pfamScan/Bio/Pfam/Scan/PfamScan.pm
pfamScan/Bio/Pfam/Scan/Seq.pm
pfamScan/htt.fas
pfamScan/pfamScan.xml
pfamScan/pfam_scan.pl
pfam_annot/annota.pl
pfam_annot/clans.txt
pfam_annot/f1
pfam_annot/pfamA.txt
pfam_annot/pfamA.txt.gz
pfam_annot/pfam_annot.xml
pfam_annot/pro
pfam_annot/prots
pfam_annot/prova
pfam_annot/prova2
pfam_annot/script.js
pfam_annot/table
pfam_search/annota.Filter.pl
pfam_search/clans.txt
pfam_search/f1
pfam_search/f2
pfam_search/lipase
pfam_search/lista
pfam_search/pfamA.txt
pfam_search/pfamA.txt.gz
pfam_search/pfam_filter.xml
pfam_search/prots
pfam_search/prova
pfam_search/prova2
pfam_search/search
pfam_search/table
b
diff -r 000000000000 -r 68a3648c7d91 annotate_ends/annotate_ends.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/annotate_ends/annotate_ends.xml Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,21 @@
+<tool id="ends_annot" name="Sanger Ends Attacher" version="0.">
+ <description> Attach fosmid ends name based on similarity  </description>
+ <!-- Argument order is positional and must match attach.tags.pl:
+      ends fasta, contigs fasta, blast table, minimum identity, minimum alignment length, output fasta, output table. -->
+ <command> /home/inmare/galaxy/tools/annotate_ends/attach.tags.pl $ends $fos $blast $minid $alnl $out $table</command>
+ <!-- A second description element used to follow the command; a tool takes a single description, so its text is kept here only as a note:
+      "approved by the boss" -->
+ <inputs>
+  <param name="ends" type="data" format="fasta" label="multifasta containing the ends of the fosmids" help="fasta only"/>
+  <param name="fos" type="data" format="fasta" label="multifasta of the assembled fosmid" help="fasta only "/>
+  <param name="blast" type="data" format="tabular" label="blast output" help="12 column output only"/>
+  <!-- minid and alnl are referenced by the command line above, so they must be declared as inputs. -->
+  <param name="minid" type="integer" label="minimum identity" value="95"  help="identity cutoff"/>
+  <param name="alnl" type="integer" label="minimum alignment length" value="200"  help="minimum alignment length"/>
+
+ </inputs>
+ <outputs>
+  <data name="out" format="fasta" label="decorated fosmid file"/>
+  <data name="table" format="tabular" label="conversion table"/>
+ </outputs>
+ <test/>
+ <help>
+When Sanger sequencing of the fosmid ends was performed, assembled fosmids can be assigned to their putative clones by sequence similarity. This tool is designed to assist in this process by parsing blastN output files. In order for the tool to work properly, Sanger ends need to be provided in a single fasta file, and sequences need to be named according to the following convention: "fosmid name" followed by "_" and "F" for forward or "R" for reverse. The fosmid-ends fasta file needs to be used as the query and the search must be performed against a database containing all the contigs derived from the assembly of the fosmids. This tool requires the "standard" (12 column) output from blastN; any other format might cause major flaws. The output consists of a new fasta file in which each contig is named after the fosmid end(s) matching it, as provided in the input file. Contigs showing no significant similarity to any fosmid end are named "unf" (for unassigned fosmid) followed by a progressive number. Minimum alignment length and identity cut-off need to be provided in input; as a rule of thumb, the alignment length cutoff should be set to about half of the length of the Sanger sequences and the identity cutoff should be above 90%.
+ </help>
+</tool>
b
diff -r 000000000000 -r 68a3648c7d91 annotate_ends/attach.tags.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/annotate_ends/attach.tags.pl Thu Dec 22 04:45:31 2016 -0500
[
@@ -0,0 +1,129 @@
+#!/usr/bin/perl -w
+use strict;
+
+# attach.tags.pl - name assembled contigs after the fosmid end(s) that hit them.
+#
+# Usage:
+#   attach.tags.pl <ends.fa> <contigs.fa> <blast.tab> <min_identity> <min_aln_len> <out.fa> <out.table>
+#
+# The blast table is read as 12-column tabular output with the end sequences as
+# queries (column 1) and the contigs as subjects (column 2).  For every query the
+# best-scoring hit passing both cutoffs is kept; the hit is accepted when it lies
+# within $MAX_END_DIST bases of either extremity of the contig.  Every contig is
+# then written to <out.fa> under its new tag, and <out.table> records the
+# decisions and the final "contig<TAB>tag" mapping.
+
+die "Usage: $0 ends.fa contigs.fa blast.tab min_identity min_aln_len out.fa out.table\n"
+    unless @ARGV >= 7;
+
+# Maximum distance (bases) between a hit and a contig extremity.
+my $MAX_END_DIST = 1500;
+
+my $fos_file = shift;
+my $fosS     = read_fa($fos_file);
+
+my $cont_file = shift;
+my $contS     = read_fa($cont_file);
+
+my $bfile = shift;
+my %best  = ();
+
+my $sim_coff = shift;
+my $len_coff = shift;
+
+my $out_name = shift;
+open(my $out_fh, '>', $out_name) or die "Cannot write '$out_name': $!\n";
+
+my $out_table = shift;
+open(my $table_fh, '>', $out_table) or die "Cannot write '$out_table': $!\n";
+
+open(my $blast_fh, '<', $bfile) or die "Cannot read '$bfile': $!\n";
+while (my $line = <$blast_fh>)
+{
+    next if $line =~ /\#/;
+    my ($in, $node, $ident, $alnL, $rs, $re, $score) = (split(/\s+/, $line))[0,1,2,3,8,9,11];
+    next unless defined $score;          # blank or truncated line
+    next unless $alnL >= $len_coff;
+    next unless $ident >= $sim_coff;
+
+    # Hits on the reverse strand are reported with start > end: normalise the
+    # coordinates for EVERY hit, so that a later, better-scoring hit is stored
+    # with the same orientation as a first one.
+    ($rs, $re) = ($re, $rs) if $re < $rs;
+
+    if (!exists $best{$in})
+    {
+        # Only the first accepted hit of a query is logged here.
+        print {$table_fh} "$in $node $rs $re\n";
+    }
+    elsif ($score <= $best{$in}[1])
+    {
+        next;
+    }
+    $best{$in} = [$node, $score, $rs, $re, $alnL];
+}
+close($blast_fh);
+
+# contig name -> list of end names assigned to it
+my %addT = ();
+
+# Sorted so that the table and the tags do not depend on hash ordering.
+foreach my $end (sort keys %best)
+{
+    my ($node, $rs, $re, $alnL) = @{ $best{$end} }[0, 2, 3, 4];
+
+    # Ids found in the blast table but absent from the fasta files count as length 0.
+    my $end_len  = defined $fosS->{$end}   ? length($fosS->{$end})   : 0;
+    my $lseq     = defined $contS->{$node} ? length($contS->{$node}) : 0;
+    my $relL     = $end_len ? $alnL / $end_len : 0;
+    my $from_end = $lseq - $re;          # distance of the hit from the contig 3' extremity
+
+    # This line is written for every best hit, whether or not it is accepted below.
+    print {$table_fh} "Add $node $end $rs $from_end $relL\n";
+    if ($rs <= $MAX_END_DIST || $from_end <= $MAX_END_DIST)
+    {
+        push(@{ $addT{$node} }, $end);
+    }
+}
+
+my $unF = 0;
+foreach my $seq (sort keys %{ $contS || {} })
+{
+    my @adds = $addT{$seq} ? @{ $addT{$seq} } : ();
+    my $tag  = "";
+    if (!@adds)
+    {
+        # No end assigned: progressive "unassigned fosmid" name.
+        $unF++;
+        $tag = "unf$unF";
+    }
+    elsif (@adds == 2)
+    {
+        # Two ends on the same contig: fosmid name (text before the first "_") plus _FR.
+        my $t = (split(/\_/, $adds[0]))[0];
+        $t = "" unless defined $t;
+        $tag = $t . "_FR";
+    }
+    else
+    {
+        $tag = join("", @adds);
+    }
+    my $SEQ = form($contS->{$seq}, 80);
+    print {$out_fh} ">$tag\n$SEQ\n";
+    print {$table_fh} "$seq\t$tag\n";
+}
+
+close($out_fh)   or die "Error while closing '$out_name': $!\n";
+close($table_fh) or die "Error while closing '$out_table': $!\n";
+
+
+
+# read_fa($file) - load a (multi-)fasta file.
+#
+# Returns a hash reference mapping each sequence id (the header text up to the
+# first whitespace) to its sequence, with all lines of the record concatenated.
+# An empty hash reference is returned when the file holds no sequence lines.
+# Sequence lines found before any header are stored under the empty id "".
+# Dies when the file cannot be opened.
+sub read_fa
+{
+    my $file = $_[0];
+    my $seqF = {};
+    my $id   = "";
+    die "read_fa: no file name given\n" unless defined $file;
+    # Lexical handle: does not clobber a bareword IN handle used by the caller.
+    open(my $fh, '<', $file) or die "Cannot read fasta file '$file': $!\n";
+    while (my $line = <$fh>)
+    {
+        chomp $line;
+        $line =~ s/\r$//;                # tolerate CRLF line endings
+        if ($line =~ /^>(.*)/)
+        {
+            $id = (split(/\s+/, $1))[0];
+            $id = "" unless defined $id; # header with no text after ">"
+        }
+        else
+        {
+            $seqF->{$id} .= $line;
+        }
+    }
+    close($fh);
+    return $seqF;
+}
+
+# form($string, $len) - wrap $string into lines of at most $len characters.
+#
+# Returns the wrapped text; every line, including the last one, ends with "\n".
+# An empty or undefined string gives "".  A missing or non-positive $len leaves
+# the string on a single line.
+sub form
+{
+        my $string=$_[0];
+        my $len=$_[1];
+        return "" if !defined($string) || $string eq "";
+        # A width below 1 would never advance the loop below.
+        return "$string\n" if !defined($len) || $len < 1;
+        my $outS="";
+        # Strict "<": with "<=" an extra empty line was produced whenever the
+        # length of the string was an exact multiple of $len.
+        for (my $i=0;$i<length($string);$i+=$len)
+        {
+                $outS.=substr($string,$i,$len)."\n";
+        }
+        return $outS;
+}
b
diff -r 000000000000 -r 68a3648c7d91 de_prokka/de_prokka.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/de_prokka/de_prokka.pl Thu Dec 22 04:45:31 2016 -0500
[
@@ -0,0 +1,49 @@
+#!/usr/bin/perl -w
+use strict;
+
+# de_prokka.pl - prefix the protein ids of a Prokka fasta with their contig name.
+#
+# Usage: de_prokka.pl <proteins.fasta> <annotation.gff> <out.fasta>
+#
+# For every feature line of the gff (read up to the ##FASTA section) the value
+# of the first attribute, stripped of "ID=", is associated with the contig name
+# of column 1 truncated just before its "_length" field.  Each protein is then
+# written as ">" + contig + "zzz" + protein id, followed by its sequence lines.
+
+my $fas_file=shift;
+my $ann_file=shift;
+my $out_file=shift;
+die "Usage: $0 proteins.fasta annotation.gff out.fasta\n" unless defined $out_file;
+
+open(my $out_fh, '>', $out_file) or die "Cannot write '$out_file': $!\n";
+
+# protein id -> sequence lines, kept verbatim (newlines included)
+my %seqS=();
+my $id="";
+open(my $fas_fh, '<', $fas_file) or die "Cannot read '$fas_file': $!\n";
+while (my $line = <$fas_fh>)
+{
+    if ($line =~ /^>(.*)/)
+    {
+        $id = (split(/\s+/, $1))[0];
+        $id = "" unless defined $id;
+    }
+    else
+    {
+        $seqS{$id} .= $line;
+    }
+}
+close($fas_fh);
+
+# protein id -> contig name
+my %decode=();
+open(my $ann_fh, '<', $ann_file) or die "Cannot read '$ann_file': $!\n";
+while (my $line = <$ann_fh>)
+{
+    last if $line =~ /^\#\#FASTA/;
+    next if $line =~ /^\#/;
+    chomp $line;                         # keep the newline out of the ids
+    my ($id1, $id2) = (split(/\t/, $line))[0,-1];
+    next unless defined $id2;            # blank line
+    $id2 = (split(/\;/, $id2))[0];
+    next unless defined $id2;
+    $id2 =~ s/ID=//;
+
+    # Keep the "_"-separated fields of the contig name that precede "length".
+    # Joining the kept fields never leaves a trailing "_", also for names
+    # without a "length" field.
+    my @kept = ();
+    foreach my $v (split(/\_/, $id1))
+    {
+        last if $v eq "length";
+        push(@kept, $v);
+    }
+    $decode{$id2} = join("_", @kept);
+}
+close($ann_fh);
+
+foreach my $seq (sort keys %seqS)
+{
+    # Proteins missing from the gff get an empty contig name.
+    my $contig = defined $decode{$seq} ? $decode{$seq} : "";
+    # "zzz" is the separator the original "\z\z\z" escapes actually produced.
+    print {$out_fh} ">", $contig, "zzz", $seq, "\n", $seqS{$seq}, "\n";
+}
+close($out_fh) or die "Error while closing '$out_file': $!\n";
+
b
diff -r 000000000000 -r 68a3648c7d91 de_prokka/de_prokka.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/de_prokka/de_prokka.xml Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,15 @@
+<tool id="de_prokka" name="Prokka parser" version="0.">
+ <description>Parsing of Prokka output</description>
+ <!-- Positional arguments of de_prokka.pl: protein fasta, gff annotation, output fasta. -->
+ <command> /home/inmare/galaxy/tools/de_prokka/de_prokka.pl $prot $pfam $out</command>
+ <inputs>
+  <param name="prot" type="data" format="fasta" label="protein file by prokka" help="fasta only"/>
+  <param name="pfam" type="data" format="tabular" label="gff by prokka" help="this file should match the protein file"/>
+ </inputs>
+ <outputs>
+  <data name="out" format="fasta" label="annotated prokka file"/>
+ </outputs>
+ <test/>
+ <help>
+ This tool adds information derived from the gff file to the headers of a fasta file containing the sequences of the proteins predicted by Prokka. The operation is required in order to keep track of all the information related to the proteins and to the scaffolds/contigs from which they are predicted, so that it is not lost in the functional annotation step. This tool is a simple parser based on regular expressions. Works only with Prokka version xxx. Any usage outside A-Game is discouraged.
+ </help>
+</tool>
b
diff -r 000000000000 -r 68a3648c7d91 de_prokka/fas
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/de_prokka/fas Thu Dec 22 04:45:31 2016 -0500
b
b"@@ -0,0 +1,2151 @@\n+>PROKKA_00001 hypothetical protein\n+MMARPGRVTSHQASVMYARPSANMRPQVGWGGGTPMPRKESPLSQMMILATRTVPSTKSG\n+ERMLGRMWRRMMVRFRVPRLWAALTNSFSFRASTLPRTVRA*\n+>PROKKA_00002 Glutathione transport system permease protein GsiC\n+MGRYILKRLWHTVYVVVGISVIAFFFIHLSGDPVMLMLPADASHQEIEELRQQLGFNDPL\n+YVQYWRFATKAVQGDFGESLYYHVPAMELILERLPASLELALAAMAIALVVAIPLGILSA\n+VKRGSFIDLGSMLGALFGLSMPHFWLGIMLILLFSVKLGWLPTSGRGGWEHLIMPSLALG\n+MSLMAMFARLTRSVMLEVLGQDYVRTARAKGLKERLVIGKHALKNALIPLVTVAGMQFGF\n+LIGGTVIIETVFAWPGVGRLVVQAIFNRDYPLVQATVLVLAVLFVLVNLLVDLLYVYLDP\n+QISYLEEK*\n+>PROKKA_00003 Heme-binding protein A precursor\n+MKKFKRLCLALGVAALGLAILAGPALAKKDVLVVIQEAEPVGLDLMTSSIQTTMSVCYNI\n+HDTLFAPQEDASVKPRLAESWEKVDDLTWKIHLRRDATFHNGEPVNAQAVKFSFERSFKP\n+SIKNPHKGKLSAFKEVKVLDDYTLLISTKEPYAPGLYILGYYLPIVPPGYIKKVGDAKYN\n+TNPIGCGPYKLEKWVRGEEIVLTAYDKYYGPKPAFKKVIFKGVPEEASRIAALLTGEADV\n+ISGVSIHQRKRILASGKAYLTNQMGVMPYLGLNTYKPPFNDVRVRQAMNYAVNRELINKA\n+LFGGKAILCAGPISPRTFGHDPNLKPYPYDPAKAKKLLAEAGYPNGFQTRLAYPTYMSQI\n+QEQAEAIAADLAKVGVKVRLEPYERAVMWQRYKARKHAMYIYWWDDAPEPDRYMYSLFNS\n+KVRDYYYKNPEVDKLLDLGRTILDRKKRAEVYHKIDRLLYNDAPWVYLYVIPEVFAVSNQ\n+VAYQGRRDGFLDMRTAKPK*\n+>PROKKA_00004 Oligopeptide transport ATP-binding protein OppF\n+VAEVLRVKELVKHFPVRQGFFGRRQGVVHAVDGVSFTLEENQTLGLVGESGCGKSTIAFC\n+LLRLIDPTAGEVWFQGRNLAAAGSEELRRLRRDIQIVFQDPFGSLNPRMTVAQIVEEPLL\n+NHLELSAARRRELVAEGLSMVGLLPEHAQRFPHEFSGGQRQRICLARALVLRPKVIICDE\n+PVSALDVSVQAQVLNLLSRLQRQLGLSYIFVSHDLAVIRYVSQRVAVMYLGRIVEQAGVK\n+ELYARPMHPYTQALLSAVPVPNPRRRRRRIILEGDVPSPLDPPSGCHFHPRCGRAMEICR\n+HQAPELRPLADGHLVACHLYDEVRSAPGGTVEGG*\n+>PROKKA_00005 Oligopeptide transport ATP-binding protein OppD\n+VARAVDGVDLTVGRGEILGLVGESGCGKSALALSVLRLLPMPPAFFAGGQIRFKGRDLLK\n+MDPEELRRLRGNQISMIFQEPMTALNPVFTIGNQLGEVFRVHQGLARREARRRAVEMLEM\n+VGVPAPARRVREYPYQLSGGMRQRVMIAMALACRPALLLADEPTTALDVTIQAQILELIL\n+ELRDELGTAVVLITHDLGVVAETTERLAVMYTGRIVEQAPTVELFDHPLHPYTRGLLEAI\n+PSAEAELADKELHEIRGVVPSLLDLPPGCNFAPRCHLADERCARQEPELVEVRPGHRVAC\n+WRVDRG*\n+>PROKKA_00006 Primosomal protein 
N'\n+MTLMVDVALAAPLWQPLTYAVPAELAPLVKPLSRLLVPLRGGARLGFALGEPLAAGGGQD\n+ALKPVLDVLEDGKGPQVWPPELLPFFQRAAAYYHVPLGQVLAWCLPAGMGSARPAKALAP\n+KTQQVAVVSWRRGEDSRLPRPESQAARILRRLKARGPLPLPELREEFPRAAALCRDLEKR\n+GWVTISHRPLVKDLLGRPLLPEPEPEHYTPDQQRALDELLPAVHSGGFKSFLLHGVTGSG\n+KTELYMACVKAALEAGRTALLLTPEIGLCLRLEGLLRQRFGAGQVAVLHSGLSPAARRGQ\n+WLAIARGRARVVVGARSAVFAPLREPGVICVDEEQDEAYKQEDRFRYHARDLALLRGREQ\n+DCPVVLGTATPAVTTYHRAQEGNTVCLRLPRRVREAPLPRMELVDLRREGRLVGGFLSRR\n+LLAALEQTLEAGEQAILFLNRRGFAPAYLCTACGQTVGCPACAVSLTLHQGSDRLVCHVC\n+GHQRPRPRTCPACGAGEEKLRPLGLGTEAVAQKLGELLPGARIARLDRDTAGDPRRLGEL\n+LRAIAERRVEVVVGTQMITKGHDFPGIGLVGVLSADQALALPDFRAGERAYGLLTQVAGR\n+AGRQGGKSRVIVQAYDPDHHALRAALAQRPDEFYQTELAERRALGYPPFMRLVALRLEAV\n+DDRRCQRAAQALAAGLEEARRRLEPGARVLGPAPAALPRAKARHRWMILLKAPTAAAAGR\n+TLRLGLHRSPPLPAGVRLLVDVDPVSLI*\n+>PROKKA_00007 HTH-type transcriptional regulator IscR\n+LALHTMAYLAAHPGRLISNRVIARDLGVSAAHLSKVLQRLARAGLLESLRGPTGGFRLGR\n+PAGEISLMEVYEAIDGKFQPSSCLLGRPVCRGGKCVLGELGRNLERQTREYLLNTKLSEF\n+EDFMCFEEGN*\n+>PROKKA_00008 Anaerobic sulfite reductase subunit C\n+MPLGKGRGGALPWDPRAEAALGRVPFFVRSLVRRKVEERVAEAGGRRVGLEDFQEAEAAF\n+RAVRAGKSQKELEAMLPAENRPGVEMVVVQACRSRLSNCPNPLIDTQKWLERVQAWVEEL\n+DLSERLRRRVADDKILFHHKLKIAIAGCPNGCSRPQIADLALVGMTRPRLVEPEVCTACG\n+ACAEACPDGAVSQDDGPPEFHRELCQGCLSCSRACPVGGIELDPPGVRVLMAGKLGRHPH\n+LARPVMEATGPEPVLAYWTRELEEYLASAPPGRRFSAWWLEQHPAG*\n+>PROKKA_00009 hypothetical protein\n+MPIPGRLLTTAMAVMPHTDVDQALASALSLDIPFWPQLPRVNYYEDMYVQASEHFPGMVV\n+DHKERTLVFSMDKFMVELEETLAHLEEPEYFDISPEYSVVYHRFLELELADRPAIRGQLE\n+GPISFGLNVKDQDDRPILFDDTVRPFLLEVMARRVNVQLTRLKARNPNAFMFVDEPGLQF\n+IFSGLSGYSDRKAKEDLDQFFAAIERPRGIHLCGNPDWDFLLNLDLDILSLDVYSNGEVF\n+SSYARSIKRFLDRGGVLAWGLVPTNFEPFSAEDHVSLKARLKEIWSALESKGVDRELMLE\n+RSLLSPATCCLVNPDGEKTVDKAFALVRALSAELRDEYGLDG*\n+>PROKKA_00010 diguanylate 
cyclase\n+VEVDISLFSNQNQFVILRVGEQAYALPAAQVREMQVLPEVTEVPRAPAHLRGIISPRGEV\n+LPLFDLRRRLGMRSLAEEADELLKILEAREQEHKQWLEELESCIREEREFTLPTDPEKCA\n+FGQWYQNFTTEDLALASVLERLAAPHRRVHEVAGAALEALEKEGQAAAQEVIDRARRIIL\n+PKLLELFAELKRLIRETHQEIAVILESGRHTLALAVDNVDSVELLQPKDLQNLERFGPVD\n+GSQDLLESVGRRANGETVYILKTAEFFQAATDLTF*\n+>PROKKA_00011 putative pho"..b'KVVGIGQKVRFNQTINADKGVIISEIDPQSSLAGIGVRPGDVIR\n+KVDAEATNTVQSFYKAMIKDRWKQSIVILLQRGDQGYYITLKLS*\n+>PROKKA_00312 hypothetical protein\n+MFQFLLFVGSITAFIIGGLIVLIGIGAITGCAGGILAMCSGAIIAVLGAWSAITFFLPSP\n+DPSVPARETINLIRRNGRWM*\n+>PROKKA_00313 PHB/PHA accumulation regulator DNA-binding domain protein\n+MHLVKKYANRKLYDTTDKQYITMEKLAELIKSGSEVMIIDNETGDDLTAQVVSQLLAREK\n+NEDDTALPSSVLMQMLRKGRGTLFGYGKKYISLWQSAVLMSRDENEKLINTLVKDKELSE\n+TEGRTLKKEITAYTNGLKTWIRENIDQRVNEALNMMNLASKEQVKELIDQVESLSLKVQS\n+LEREIRRKN*\n+>PROKKA_00314 Putative diacyglycerol O-acyltransferase/MT1809\n+MRKQVEIMSSIDNFWLYMDHPTNLMIITGFLQFDKPINFERLKQTIKNRLLCYDRFKKRV\n+IRPMTGVGNATWELDPRFDLRSHLHRVALPAPGDKETLQELISDLTATPLDPTKPLWQLH\n+YIENCENGGSVLFARIHHCIGDGISLIRLLLSLTDTEPNAVWSDCLNEPKIEKETSFNLF\n+PPLESAMKKVTRARRRAQKVTRFVSREIEKSFSNPYHIVKRTRTVTKFALDVATVMSKIL\n+LLPADRKTVFKGELGVRKSVAWSDPLPLDDIKVIGKYFNATINDILVALVTGALRRYLQQ\n+CNNLVGDLDIRVAMPINIRPIDGDIELGNQFSLILVALPVHIDDPVLRIREVQRRINDLK\n+EAPDAAVAYAVLNALGVSSAKLAKTAATMFANKTTGVFSNVPGPRQQLYFCGEKINNIMF\n+WVPRIGGLGIGISIISYNNEVSLGIATDSGLVQDPKAILDHFANEFRMLLGMYKAGQMEK\n+EPLVINDRSVEPPVFAFNTEKIASVQAIRCKAITRSGTQCHNRAATNSMYCTLHLSKYET\n+IASREENDMPAEADNTLPAEDQAAG*\n+>PROKKA_00315 cell division inhibitor MinD\n+MNAPAPTPNNPRIVVCCGSGGVGKTTISAAIGLCGALMGKKTVVLTIDPARRLADALGIS\n+ALNMEAQRVPLEASVPASGELYAMMVDAKRTFDRLIGRYSSAGLRDRILENRYYQHVSNN\n+MAGSHEYMAMERLYEIYHEKRFDLIVLDTPPSRRALDFLEAPQRVINLLGHPYFLKLFKP\n+YIKAGQLSGRLFNLLAMPVLRAVGQVVGGQTISDIFSFFQLFNDMLFDGFSKRASAVESL\n+LSDPMTTFFAVTTPQEYPIQEATYLFRQLQQRNMPFGGFIVNRVHSDTADSPFDSEAADR\n+KRVLMEKIADKPIFQRLEIADRMDRKLARSDAAAIDRISSISPGLAVFPILFADETVNDI\n+SGLRVISTQLMKHPEFKI*\n+>PROKKA_00316 Arsenical pump-driving 
ATPase\n+MAKADESDRTQLEALFSIGAGVRSLDELLTRRLIFLMGKGGVGKTTLSVALALTAEMMGK\n+RVLLTEIGDSQGIGRYFDAQPDVRPRQVSSAIWAARVDPKDELTAYLHYHMKSGFIANRI\n+TQSRLFDYLLAATPGLKEIMTLARIWRWEKAKNKAGTPLYDTIIVDAPATGHGLSLLRLP\n+KMLVEMIRVGPIASQVNGVQQMLLNPERTALTLVTLPEELPVNETREMIDIAVDEVGIPV\n+QAVFINGVHPVFVTPDEFSRIQELDRDCPDADPDCPDLRFALDVARRQIVRNAAQQVQMN\n+EVHAAAPGHVIHVPYYYTNDLGPEEIRTIAASLHRQISEAPRGGGR*\n+>PROKKA_00317 PHB/PHA accumulation regulator DNA-binding domain protein\n+MYQIKRYANGRFYDTVEKNYVTREQISKLLGAGKKISIIDTRTEKDITDDIVSRIKAKKQ\n+NPSKSKKAGKSNKAVDDSTGMLVQLFRKGGDALFDYGKRYASMWQNMVTMSRDEVDKLVN\n+MLVKDNKLTELEGSKLKKEIDRYRTNIQGWITRNIDNRVNEVLNRMNLANRDQILELTGK\n+IEELNKRINRLGKEKKGPAKTKKTS*\n+>PROKKA_00318 hypothetical protein\n+MAKTATKKGETAQTKITGKIQKAAESVTDKVKGYNEKYVAKNIEKGKATLKEYNEKYLVK\n+TVEKGKDTLKEYNDKYITKAVEKGRSYVDGPYKKLSGTMDQWLEKGRSFEKDAWKKMDGY\n+VENGKKFMYKLPLVETVEKKVTSSLNSVPSVVNLPGKGDIEKLTLAMEALNSNIEALRKQ\n+SAQ*\n+>PROKKA_00319 hypothetical protein\n+MGKTIRRALVLSGGGARGAFEVGVMRYLNEVNWQPDLICGTSIGAINGAAFGSGMSVDEL\n+AHLWKTYHRKQMYKITFPAFFRTLLSGRKFSPLSDNRPTRSLLEKTIDIDALRNSTTEII\n+ISVLNMRTSQVRYFTHKAIGIEHLMAAGGIPMMFPWQYIDGDPYWDAGVMVNTPIMPAFE\n+RGATEIIVVLLSPLGAIPQRLPSTHREVSELVFEQFLIGSYTACLPNAGWRTNPEADVYD\n+TPLPDSPQLQLSMKGVRMATVYPTRMLGFRSLLDFSPRQAKTLLRDGYVNARMQLKSFFK\n+*\n+>PROKKA_00320 hypothetical protein\n+MPTATIRQQLIELLSENKYDARDLSQRLGVRETVVYDSIPHITRSVTSMGKKLKIVPSRC\n+TSCGYTFKDRKRAAKPSRCPTCKSERIAKPKFYIV*\n+>PROKKA_00321 3-dehydroquinate synthase\n+VKTVSISGQTGASKIVIGERLENLSNYLPDRRIVVITDTNVAGHYGKMFPDVEVITIGCG\n+ESIKTLDTAKMIYERLVSMAADRSVFIVGIGGGIVCDITGFIASTYMRGVRFGYVATTLL\n+AQVDASVGGKNGVNFMGYKNMVGVFNQPEFVICDPYVLGTLPPRELACGFAEIVKHAAIS\n+DKDYFADLEESHEKACARDPETLERIIRKSVVIKAGVVNADEKERGERRKLNFGHTLGHA\n+IEKTLGVPHGEAVSAGMVMAAELSANRGHLPRPDIRRLKDLLTHLDLPTALPIDPERIID\n+AMARDKKRQGEKIHFVLLSAIGAAFVDSISLAELEAVVTG*\n+>PROKKA_00322 Phospho-2-dehydro-3-deoxyheptonate 
aldolase\n+MLIVMRQDASREQIDAVIRAIEARGYTARSIPGGDRVSIGILNNRTAIDAAWFQDMPGVK\n+ETIPVTRPYKLVSREIQPHDTIIRVGGVEIGNGHLVIIGGPCAVESEAQVMATAERVKKA\n+GADIFRGGAFKPRTSPYAFQGLGEEGLKILARAREQFGMPIVTEVMDLEYFDMVEAYADI\n+VQIGTRNMQNFSLLRRAGESKKPILLKRGMSATIDEWLMAAEYVLSQGNPNIILCERGVR\n+TFVRHSRNTLDLSAIPVVQRESHLPIIVDPSHATGFRDQVIPLSRAAAAARAHGLMIEVH\n+NAPDTAQCDGSQSLYPDQFETLCRQVRSIFRILGETDETR*\n+>PROKKA_00323 hypothetical protein\n+MILIDILRRNWYDLLPLNEIGNICAVKFGNKAADP*\n+>PROKKA_00324 Fatty acid metabolism regulator protein\n+MTNATNDQPFRPARFTEQRLITAILDGTCPPGSVLPAERRLAEQFGVTRPTIRETLQRLA\n+AEGWITIRHGKPTRVNDFWETGGCS*\n'
b
diff -r 000000000000 -r 68a3648c7d91 de_prokka/p1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/de_prokka/p1 Thu Dec 22 04:45:31 2016 -0500
b
b'@@ -0,0 +1,2474 @@\n+>NODE_25#PROKKA_00102\n+VVMEEYQKTLARLLAESGGLFFQEGLRLKDGRPTPYFVNLGVFRTGRLALELGRCFSLWI\n+HHHGLDQDLDCIVGPSYKGSAIAQATAIALYELHGKEVAYDYDRKEAKTHGEATGHGYLF\n+VTGAALQGGKVLIIDDVGTSMSTKLELLKKLSWLKPRLERPMELLGVVLAVDREQTQAVY\n+DAQGRVREGVRGPDAMESFRQESGLEVWSLLGIRQALDYLYKEGIPVLIQGEMRPLDELT\n+MQIAREYLELYGREEA*\n+\n+>NODE_17#PROKKA_00047\n+MQVVQAILSKVHEAGDIYFAKYGGHYCVGCERFLTEHEMIDGKCPDHGTEPVYQEEENYF\n+FRMSDYTQPLKEYIRANPDFIRPERYKNEVLAILDQGLEDLCISRPKTRLTWGIELPFDQ\n+NFVTYVWFDALINYLSGLDWPDGELLERFWTGPKADPQHLIAKDILKPHGIYWPTMLMAL\n+AKAEGRPLDHYLYRHLNVHGYWQVGEGKMSKSRGNVVKPLDLAGIYGVDPFRYFLLREMT\n+FGLDASFSEDLLVERYNADLANDLGNLFSRVLNMLSRYRDGLLPELHPQELTEADREMKG\n+ALAASLGPGAEHDFQAQVREFRFHTALADLWSQVRRANKYIVAREPWVMAKDPDRAAELD\n+NVLYILVQLLASVTHLAWPVMPATAEKMAAMLGLELVVPVDWQRLFALELMTPGAKAEKP\n+QALFPRVQTDKVKAKAARKEAKQAQQQPAAKGGGKQKAKPQDKAGLITIDEFAKVELRLG\n+RVLEAGAIKGADKLLKLKVDLGEPEPRQIVAGIARHYRPEELVGRQVVVVANLKPAKLRG\n+EISQGMVLACVAEGRVRLVAPEEELPPGSVVR*\n+\n+>NODE_3#PROKKA_00175\n+VLACFYFFADSPMLNWSQIDTALLDMDGTLLDLHFDSHFWLEHLPRRYAELKHLDPEHAR\n+QSLLSKIEQLRGKLDWYCIDFWSDLLDLDVVALKRETRDRIAWRPHSKAFLERLRACGIR\n+RVLVTNSHPDGLNLKIETTGIDQHLDRLFSSHSFGQPKEGPDFWEQLAQQEPFDPERTLL\n+IDDSLPVLESARRYGIRHLLAILSPDSQQPPRQPSHHPCVHDFDELFQSLDQFAHQKNRI\n+DGLSD*\n+\n+>NODE_8#PROKKA_00285\n+MAQPYEGLVGTSGRVQVNRNGRPAFSISTGAFADGWRGASLTPAKVGETTDGVCLGKISL\n+PDKLTIASALRATAAGKAMELRYTLTPKADAKLNSLHVSFGLPASFLKGASYTIEGETKE\n+VPAVLGATHLRAGDHVPSVRFTWPNGDWLQVDILSKTPVLFQDNRQWGDSFDLRLGPQMV\n+PAQTLPANQPVEIAMRVSAKDGMKLDFDRPVTITAGKDWVPLDLELDIEPGSALDFSGLG\n+QFDAPSGKHGWLQATPDGKFAFADSLDTPRRFYGVNLCFTAQYLSHDEAERLAERFLRLG\n+YNTVRFHHHEYPLIDRKNGCSTDLKPESIDQLDYLFAQFKKRGIYVTTDCYVSRPVYASE\n+IWDGAKGNVEMNEFKMLVPVNERAFENWKTYNRNFLTHRNPYTGMRYADDPTLAWLSMIN\n+EANFGNYIRSVSDRARPDWERAWGAWLKARYGSAEAITKAWGSTFDGDLSKPTAKLAKSF\n+TDDNRQSRDFAVFLADTERTMFLKMKKFLREEIGTKAMLTNMNGWTNTPQSQLARAEFDY\n+VDDHFYVDHPQFIEKSWRLPSRCPNTSPVLAGAPGGRGTAFNRLMNKPFTISEYNYSGPG\n+RYRGVGGILTGCMAALQDWSVVWRFAYSHRRENVLKPSTAGYFDMATDPLNQAAERASMC\n+LFLRGDLDPAPRSAAITLNPETLEKGDSHQGRTPPSWDELVPVIQ
VGTFLGDRQSKVPAD\n+IALPTTDAAPAAADVVMPKPYDSGKGSAILKELRAKGWLDAANKTDLDRKRGQSASDQFL\n+MDGEKDMMVLDTPRTAGGYAEAGQTIHTQAADFSILDTGATVWISSLDKQPITSSKRLLL\n+THLTDLQNTEVRYAERGRKTLLAWGKLPHLVRVGEAKISLHRSGAKLPKVYVLATSGHRL\n+GEVPVTKGKNGTLELAISTKGEAGAQLMYELDFR*\n+\n+>NODE_3#PROKKA_00181\n+MRWLSHSAFLFSHGRSAVKALSLLLLFSLGGCSAVNNMMYKTTGEVMVGYAKAHAVPYVL\n+SSDDLGMSCAMSEALTPLLMSFGQVTAKPDQLGVMMQMSAGTCAEEKGWNAELAYMKELR\n+NQHPQNAEDDMIVEKRHYIEAADRYYSAWKHLVAYYGDPSTGQCPTFKNDEGQFIYMAGL\n+LAGVQALAAEIQSTSDEGVPKNIGSTVAQASGCLSDDKWWGVPMALRATVWSMIPGAKPD\n+GENPFQRLDESDKKANKARVRLAYVLHVIAAWNKGDTKLVKKLIREQQAQEAKYPADPRW\n+KMIDKLSTLYLRSISDRMWVEHTGHRTPIGGLGTFWDDSKGSGEVIDLDSVM*\n+\n+>NODE_3#PROKKA_00179\n+MSEAIEQSSMYLQKRTIGGRSAREWFSSLPACILLMAVVLFTTSSDIHNKALQLGQVLWS\n+GYYKLRVDPVKPDCNPNVNVDAQVKRQIAAQAAQQDSMLGSLVGSSPVNPAAVRQSVINA\n+KQACEAQFADYNATKGRITEGVRVYRSVELFISDVVAFGLASQRYILALLVLVCAATATF\n+SRHHIAMRGMETRLDHIVSHFMQFIANTMLLISSFMYRQMSHNSGAVVTTGQEISHDIWI\n+AGFLLLTIVSLVQLFRVPEDAEEGGTLGHAFLCVPLYTTMCLISGTFFAFVGSPAGIGIY\n+LDKMMELADQFLNVGLYVWAGMMLKQTRLASLVFNVLRPLKLPPELLAVVAVMVAAVPTA\n+YTGASGIFVIAAGAVIYSEMRKAGARRQLALASTAMSGSLGVVLNPCLLVVVIAYLNREV\n+TTDSLFHWGGWVFLLTSTLFLITSLVVNRQKGFKVAPMNEALPEMVMRLKPLIPYVLVIA\n+GVVFFYWLLLGVTMNEFSAPRILPIIMVGILVYEHVHFRGDRNKVSGEVDHQGLEKSLRT\n+ATSETTAEIGALLLLFGLSVSIGGVIERSQVMSLFPQALPSPWLAMMLMVVILVILGMIM\n+DPFGAVILVSATIADLAYQSGIAPVHFWMVTLVAFELGYLSPPVALNHLLTRQVVGESEM\n+NLSYRESGSFYQRHERVLMPLLVMGSALLIVAFVPLLFYAR*\n+\n+>NODE_9#PROKKA_00319\n+MGKTIRRALVLSGGGARGAFEVGVMRYLNEVNWQPDLICGTSIGAINGAAFGSGMSVDEL\n+AHLWKTYHRKQMYKITFPAFFRTLLSGRKFSPLSDNRPTRSLLEKTIDIDALRNSTTEII\n+ISVLNMRTSQVRYFTHKAIGIEHLMAAGGIPMMFPWQYIDGDPYWDAGVMVNTPIMPAFE\n+RGATEIIVVLLSPLGAIPQRLPSTHREVSELVFEQFLIGSYTACLPNAGWRTNPEADVYD\n+TPLPDSPQLQLSMKGVRMATVYPTRMLGFRSLLDFSPRQAKTLLRDGYVNARMQLKSFFK\n+*\n+\n+>NODE_8#PROKKA_00289\n+MSISIASQPGLTWHPAYCRPRTEKVVDDYCKRHDIPCYLPLLRQRKRYQRRTVETYLPMF\n+PGYVFVQLGPDTRTTFLECHRIVHIVEVREAQERTLVAELTELQHLETAQATVDLEVMPD\n+IKPGTQVTITDGPLAGITGVVEKRKGKTRVTVNVELVGRSVVAEMDLGELELDGDA*\n+\n+>NODE_7#PROKKA_00260\n+MQVYPKFLLSA
TAPAHFPPPSAPEFAFLGRSNVGKSSLINALLGSRQAKVSSTPGRTRAI\n+NFFSLTTSPNRQQPNFLFADLPGYGYAKISK'..b'+\n+>NODE_2#PROKKA_00144\n+MEFKFEKLIIWQKAMEFGEEINSIAHKFPKDEVYNLSSQIRRAVDSIALNISEGSIGQSN\n+LEFKKFMSYAIRSLAEVVSCLHKAKRRNYITEDEFKKQYEFAYNLMNMMVAFREKIK*\n+\n+>NODE_2#PROKKA_00141\n+MNIEAFYSLSYGLYIIGTASKGKKNGYVANTAFQVTASPEQIAISCNKDNLSEQMIDESG\n+YFSLSVLEKDASKEIINRFGYKSGKTLDKFEGTKYFETNNGIPVVTEECVAWFECKVEQK\n+VDVGTHIIFIGRVLNGEYLDENKESLTYTYYRQVRHGLSPKNSPTYVDKSLLPEKEKKEE\n+KAEETPAEKPKGKSMQKWECIVCGHIYDPAVGDPEQNIPPGTAFEDLPDDWVCPDCGAEK\n+EDFEPIG*\n+\n+>NODE_2#PROKKA_00118\n+VKWDDNWALKLLIQNISLQAIFYQPTISFNLGNKFGVGAGLVYATGNVKMNSALNYSGNS\n+GFNLNGKTHNFGFNVGVHYKISDQWSLGATYRSEIKMNVKNGNAAFFVPGSLSSIIPPSN\n+HFSASLPLPANFDFGVAYQATKKLLLAAELDWVRWSVYDSLSFHFATNPQLLNNSSPKLY\n+KDQWIPRIGAQYQVSKKLMVRAGAYYELSPANVSGYYKTNTVVPGIGINYHF*\n+\n+>NODE_10#PROKKA_00002\n+MGRYILKRLWHTVYVVVGISVIAFFFIHLSGDPVMLMLPADASHQEIEELRQQLGFNDPL\n+YVQYWRFATKAVQGDFGESLYYHVPAMELILERLPASLELALAAMAIALVVAIPLGILSA\n+VKRGSFIDLGSMLGALFGLSMPHFWLGIMLILLFSVKLGWLPTSGRGGWEHLIMPSLALG\n+MSLMAMFARLTRSVMLEVLGQDYVRTARAKGLKERLVIGKHALKNALIPLVTVAGMQFGF\n+LIGGTVIIETVFAWPGVGRLVVQAIFNRDYPLVQATVLVLAVLFVLVNLLVDLLYVYLDP\n+QISYLEEK*\n+\n+>NODE_9#PROKKA_00313\n+MHLVKKYANRKLYDTTDKQYITMEKLAELIKSGSEVMIIDNETGDDLTAQVVSQLLAREK\n+NEDDTALPSSVLMQMLRKGRGTLFGYGKKYISLWQSAVLMSRDENEKLINTLVKDKELSE\n+TEGRTLKKEITAYTNGLKTWIRENIDQRVNEALNMMNLASKEQVKELIDQVESLSLKVQS\n+LEREIRRKN*\n+\n+>NODE_7#PROKKA_00278\n+VNVLRKSFVVAVVLLSFGASAAVAQTTVAASVYGAFRSSTRTGGISNFTVENPSNAAGFL\n+LELRHISNPLMGYELTYSYHRANEAYSNTLKVLCPISPGGSCPEQITTAGVSANAQEVTG\n+DWVVSFPLANLKPFVLVGGGVIVTSPATGSVTATITDFDPVTNMMSQTTSSMPTQTQTKG\n+VFQYGAGLDWTVLPHIGLRFQYRGNVYKAANLTKVFTSTDKFTQTAEPVVGVFFRF*\n+\n+>NODE_3#PROKKA_00158\n+MLSRAIMAPMDLVLDTAEAQRMALFFDHILIWKLSRRTFNKEDNQRYSSELRYLRERGVA\n+LLCGLDIPNLISFGRADGTTWNPMEEMKKDCDLLLPFQVGTGVPDQAENEAHADRLIRHL\n+SSRLMYNDKPVVAHAEAVNLNTQGNELNALEITINNIPMPPENIPWEDLIQFRNEEETVA\n+KLRALRIWLKDRSSAGQSPREIQEELEHLLYEYRKYMEIQHKKFRQGILSTLISSTPEIV\n+ASVATLNFGAAIKSVFDIKGRYLGLSEAELSAPGREVSYIAKARDFLTS*\n+\n+>NODE_11#P
ROKKA_00020\n+MKLVVFLKQVPGVTEIPWDPASGHLRREKAPGMMNPACRHALEAALILKEQHGGELTAIS\n+MGPPAAEEILREALALGADRAVLLSDPRLAGADTPATSYTLSLAVRAVCPDCDLLLLGNQ\n+TSDSETGQVGPHLAEELDLPSAINVEELELDGEVLRVKRLCDNFLETLEMDLPALVTINT\n+QGHPPRQVPLGGVEDAFSRGEFLVLNAEDLKADLARVGMTGSAGRIVKVYPAGGERKGEL\n+IKGAPKRCVLELLERHGDLLGGYLRKDLGGGR*\n+\n+>NODE_13#PROKKA_00038\n+MELTNDEREFFALVNRASLLNPFSDERNDVDLKLAGLPSAAPGTGRVKKAIQSVNERIRQ\n+LETDGRADISQYTGRDRELVEKAFLFELFYRFRKQFDELIESQIASDDVPARIPFYNDAF\n+SAMQKRGFTEEDFRRYFALAFQIRRAFYFIGRSLVGNSASMKSLRLNLWNNVFTHNMDLY\n+DRYLWNRMEDYSTLILGETGTGKGAAALAIGRSGFIPLKKKSFEESFTRSFISLNLSQFP\n+ETLIESALFGHKKGAFTGAIENYQGIFEQCSPYGAILLDEIGEVSKPIQIKLLQVIQDRV\n+FTPVGSQTRSRFNGRVIAATNRPLETLRGKGFFRDDFYYRLCSDIIVVPPLRQRVQEDPT\n+ELDVLLDFTINRLVGRSSPELVQIVREVIDRHLGNDYPWPGNVRELEQCVRRVLLKGIYT\n+GDAAVADIDLCRSLTTGIEQGNIDANSLTSGYCYLLYQRHRTFEEVARRTGLDRRTVKKY\n+IQDWTSSHSTDNPPETDIPG*\n+\n+>NODE_1#PROKKA_00055\n+MEISVSEFLKPRIAGLTELGENRTRIVLEPLERGFGYTLGNSLRRVLLSSMPGAAVVEAE\n+IDGVLHEYTAIDGVQEDVVEILLNLKLLAIRMHAREEATLTLNATGAGVVTAGDIQVDHD\n+VEIVNKDLVIAHLAKNGKLSVRLKVMRGRGYMPVVKRYADESQGRKIGKLKLDATFTPIR\n+RVAYYVEAARVEQRTDLDKLILDIETNGTIGAEEALRRAAGILTDQLSVFADLSSVSSHT\n+PTESRSVKPILLKPVEELELTVRSSNALKAERIRFVGDLVQKSEDELLKTPNLGRKSLTE\n+IKDVLARHELALGMKLEDWPPAALAERRAS*\n+\n+>NODE_10#PROKKA_00007\n+LALHTMAYLAAHPGRLISNRVIARDLGVSAAHLSKVLQRLARAGLLESLRGPTGGFRLGR\n+PAGEISLMEVYEAIDGKFQPSSCLLGRPVCRGGKCVLGELGRNLERQTREYLLNTKLSEF\n+EDFMCFEEGN*\n+\n+>NODE_2#PROKKA_00147\n+MDAGEEKASPKKTILIAEDDETSFFFLKFVLAKENVNILYAQSGQEAVDICEAHPEIDLI\n+LMDIKMAGMSGIEATQLIKKRNPRVPVIAQTAFALSSDKENILKAGCDDYITKPIRKEEL\n+LEKVNFFLYSKKES*\n+\n+>NODE_1#PROKKA_00058\n+MNPYERYLLPWLIDAVCALPAAARERAKIVPRARGEVLEIGIGTGHNLPYYAPRRVAGVT\n+GIDPGVLRRRIMRRAHAAGIEVKLLSLSAESIPAEDASFDTLVSTFTLCSIPDVERALAE\n+MRRVLKPTGRLLYLEHGTAPDPRVRRWQDRLTPWWKPLAGGCHLNRDIPRLITGAGFDIV\n+EQHSEYIRGPRILSYVFRGEAQPIAVAGSK*\n+\n+>NODE_3#PROKKA_00190\n+MSVKLGIVMDPIGAIHYKKDTSLAMLLAAQRRGWELHYMEMQDLYLRDGEPRARTQALTV\n+AANPDDWYSLGEPSDRALASLDVILMRKDPPVDKEFLVTTWMLEAAERLGTLVVNPPQAL\n+RDCNEKLFATWFPQCTPPLVVSRDAARLRA
FHAEHGDVVLKPLDEMGGRSIFRVREDGDN\n+LGVIIETLTKDGSHQIMAQKYLPEITQGDKRILLVDGEPVPYALARIPSQGEHRGNLAAG\n+GRGEGRLLTDRDRWIVEQVQPMVREKGLLFVGLDVIGDYLTEINVTSPTCVRELDREYDL\n+DISDQLMQVIADRLARR*\n+\n'
b
diff -r 000000000000 -r 68a3648c7d91 de_prokka/tbl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/de_prokka/tbl Thu Dec 22 04:45:31 2016 -0500
b
b"@@ -0,0 +1,6309 @@\n+##gff-version 3\n+##sequence-region NODE_10_length_13322_cov_17.9253_ID_19 1 13322\n+##sequence-region NODE_11_length_12385_cov_59.906_ID_21 1 12385\n+##sequence-region NODE_12_length_10380_cov_58.6117_ID_23 1 10380\n+##sequence-region NODE_13_length_9239_cov_583.686_ID_25 1 9239\n+##sequence-region NODE_17_length_5536_cov_76.8441_ID_33 1 5536\n+##sequence-region NODE_1_length_50014_cov_374.697_ID_1 1 50014\n+##sequence-region NODE_21_length_3078_cov_41.6478_ID_41 1 3078\n+##sequence-region NODE_22_length_2891_cov_11.4392_ID_43 1 2891\n+##sequence-region NODE_25_length_2759_cov_91.0869_ID_49 1 2759\n+##sequence-region NODE_29_length_2081_cov_1172.16_ID_57 1 2081\n+##sequence-region NODE_2_length_41759_cov_213.726_ID_3 1 41759\n+##sequence-region NODE_39_length_1472_cov_66.0355_ID_77 1 1472\n+##sequence-region NODE_3_length_39215_cov_155.142_ID_5 1 39215\n+##sequence-region NODE_42_length_1523_cov_23.4108_ID_83 1 1523\n+##sequence-region NODE_4_length_32829_cov_185.347_ID_7 1 32829\n+##sequence-region NODE_50_length_1320_cov_1878.61_ID_99 1 1320\n+##sequence-region NODE_5_length_30845_cov_61.6996_ID_9 1 30845\n+##sequence-region NODE_6_length_29555_cov_90.739_ID_11 1 29555\n+##sequence-region NODE_7_length_23944_cov_113.632_ID_13 1 23944\n+##sequence-region NODE_8_length_21143_cov_115.55_ID_15 1 21143\n+##sequence-region NODE_9_length_20531_cov_182.121_ID_17 1 20531\n+NODE_10_length_13322_cov_17.9253_ID_19\tProdigal:2.60\tCDS\t51\t356\t.\t+\t0\tID=PROKKA_00001;inference=ab initio prediction:Prodigal:2.60;locus_tag=PROKKA_00001;product=hypothetical protein\n+NODE_10_length_13322_cov_17.9253_ID_19\tProdigal:2.60\tCDS\t865\t1791\t.\t-\t0\tID=PROKKA_00002;gene=gsiC;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P75798;locus_tag=PROKKA_00002;product=Glutathione transport system permease protein 
GsiC\n+NODE_10_length_13322_cov_17.9253_ID_19\tProdigal:2.60\tCDS\t1853\t3352\t.\t-\t0\tID=PROKKA_00003;gene=hbpA;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P33950;locus_tag=PROKKA_00003;product=Heme-binding protein A precursor\n+NODE_10_length_13322_cov_17.9253_ID_19\tProdigal:2.60\tCDS\t3401\t4405\t.\t-\t0\tID=PROKKA_00004;gene=oppF;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P24137;locus_tag=PROKKA_00004;product=Oligopeptide transport ATP-binding protein OppF\n+NODE_10_length_13322_cov_17.9253_ID_19\tProdigal:2.60\tCDS\t4398\t5318\t.\t-\t0\tID=PROKKA_00005;gene=oppD;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P24136;locus_tag=PROKKA_00005;product=Oligopeptide transport ATP-binding protein OppD\n+NODE_10_length_13322_cov_17.9253_ID_19\tProdigal:2.60\tCDS\t5673\t7919\t.\t-\t0\tID=PROKKA_00006;eC_number=3.6.4.-;gene=priA;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P17888;locus_tag=PROKKA_00006;product=Primosomal protein N'\n+NODE_10_length_13322_cov_17.9253_ID_19\tProdigal:2.60\tCDS\t8063\t8455\t.\t+\t0\tID=PROKKA_00007;gene=iscR;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P0AGK8;locus_tag=PROKKA_00007;product=HTH-type transcriptional regulator IscR\n+NODE_10_length_13322_cov_17.9253_ID_19\tProdigal:2.60\tCDS\t8455\t9315\t.\t+\t0\tID=PROKKA_00008;eC_number=1.8.1.-;gene=asrC;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P0A1Y2;locus_tag=PROKKA_00008;product=Anaerobic sulfite reductase subunit C\n+NODE_10_length_13322_cov_17.9253_ID_19\tProdigal:2.60\tCDS\t9345\t10373\t.\t+\t0\tID=PROKKA_00009;inference=ab initio prediction:Prodigal:2.60;locus_tag=PROKKA_00009;product=hypothetical protein\n+NODE_10_length_13322_cov_17.9253_ID_19\tProdigal:2.60\tCDS\t10482\t11309\t.\t+\t0\tID=PROKKA_00010;inference=ab initio prediction:Prodigal:2.60,protein 
motif:CLUSTERS:PRK09894;locus_tag=PROKKA_00010;product=diguanylate cyclase\n+NODE_10_length_13322_cov_17.9253_ID_19\tProdigal:2.60\tCDS\t11401\t12543\t.\t+\t0\tID=PROKKA_00011;gene=mlaE;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P64606;locus_"..b'CCGACACGGTCCCTGCT\n+GGAAAAGACCATCGACATCGACGCCTTGAGGAACAGCACCACTGAAATTATCATTTCCGT\n+CCTGAATATGCGGACATCACAGGTTCGTTATTTCACCCACAAAGCGATCGGTATCGAGCA\n+TCTCATGGCGGCCGGCGGGATTCCCATGATGTTCCCGTGGCAGTACATAGACGGGGATCC\n+TTACTGGGATGCCGGCGTCATGGTCAATACGCCGATCATGCCCGCGTTTGAACGGGGAGC\n+GACGGAAATCATCGTGGTGCTGCTGTCGCCCCTCGGCGCCATTCCCCAGCGTCTGCCCAG\n+CACCCATCGGGAAGTGAGCGAGCTGGTGTTCGAACAGTTTTTGATCGGTTCGTATACCGC\n+CTGCCTGCCCAATGCCGGATGGCGGACGAATCCGGAGGCGGACGTCTACGATACGCCCCT\n+GCCGGACTCCCCCCAGCTTCAGCTTTCGATGAAAGGCGTCCGGATGGCGACCGTATATCC\n+GACCCGGATGCTTGGATTTCGCTCTTTGCTTGACTTTTCTCCCCGGCAGGCCAAAACCCT\n+GCTCCGGGATGGTTATGTCAATGCGCGCATGCAATTGAAGTCTTTTTTTAAATAAGACGG\n+GATATTGCAGAAACGAGCGCGACCGGATTGACATCAAAGTGGCATTAAAATCCGAAAATC\n+ATCAGCAAAAGGAGAAAAAAATGCCGACAGCAACGATCCGACAGCAACTGATCGAACTTT\n+TGAGCGAAAACAAATATGATGCCCGGGACCTGTCCCAGCGTCTGGGCGTCCGTGAAACCG\n+TCGTATACGACAGCATCCCCCATATCACCCGGTCCGTGACATCCATGGGCAAAAAACTGA\n+AAATCGTTCCATCACGCTGCACGTCGTGCGGATATACGTTCAAGGACCGCAAACGCGCGG\n+CAAAACCCAGCCGATGCCCGACCTGCAAGAGTGAGCGGATCGCAAAGCCGAAGTTTTATA\n+TCGTCTGACCGGAAAAGTCCCGTCAGCCGGTAACGACCGCTTCCAGTTCGGCAAGGGAGA\n+TGGAATCGACAAATGCGGCCCCGATGGCGGAAAGCAGGACGAAATGGATCTTTTCCCCCT\n+GGCGCTTTTTATCCCGGGCCATGGCGTCGATGATGCGCTCCGGATCGATGGGGAGCGCGG\n+TCGGCAGGTCCAGGTGCGTGAGCAGATCCTTTAACCGCCGGATGTCCGGGCGGGGGAGAT\n+GCCCCCGGTTTGCGGACAACTCCGCCGCCATGACCATTCCGGCGCTCACCGCCTCGCCGT\n+GCGGCACACCGAGGGTTTTCTCGATGGCATGGCCGAGGGTGTGCCCGAAATTCAGCTTTC\n+GGCGCTCTCCCCGCTCCTTTTCGTCGGCGTTGACCACCCCGGCCTTGATCACGACCGATT\n+TGCGGATGATCCGCTCCAGCGTTTCCGGGTCCCGGGCGCACGCCTTTTCGTGGCTTTCTT\n+CCAGATCGGCAAAGTAATCTTTGTCGGAAATGGCGGCGTGTTTGACGATTTCGGCAAAAC\n+CGCAGGCCAGCTCCCTGGGCGGAAGCGTACCGAGCACATACGGATCGCAGATGACGAACT\n+CGGGCTGGTTGAAGACTCCGACCATGTTCTTGTAGCCCATGAAATTCACGCCGTTTTTCC
\n+CGCCCACACTGGCGTCCACCTGGGCCAGGAGCGTGGTGGCCACATAGCCGAACCGCACCC\n+CCCGCATGTAGGTCGAGGCGATAAAGCCGGTGATGTCGCAGACGATGCCGCCGCCGATGC\n+CGACGATAAACACCGACCGGTCCGCAGCCATTGAAACGAGCCGCTCATAAATCATCTTCG\n+CGGTATCGAGCGTCTTGATCGATTCGCCGCACCCGATGGTAATGACCTCGACGTCGGGAA\n+ACATTTTGCCGTAATGGCCGGCCACATTCGTGTCCGTGATCACCACGATCCGGCGATCCG\n+GAAGGTAGTTAGACAGGTTTTCCAGTCGCTCGCCGATGACGATTTTCGATGCACCGGTCT\n+GGCCGCTGATTGAAACGGTTTTCACAGGTATTCTCTCCAATAAGGAATATCAACGGGTTT\n+CATCGGTTTCGCCGAGTATCCGGAAAATGGACCGGACCTGGCGGCACAACGTTTCGAACT\n+GATCGGGGTACAACGACTGCGATCCGTCGCACTGGGCGGTATCCGGGGCATTATGGACTT\n+CGATCATCAATCCGTGGGCCCGTGCCGCCGCCGCCGCCCGGCTCAGGGGAATCACCTGGT\n+CCCGGAATCCGGTGGCATGACTGGGATCGACGATGATCGGCAGATGGCTTTCCCGCTGCA\n+CCACCGGAATGGCCGACAGGTCCAGCGTATTGCGACTGTGCCGGACAAACGTGCGGACCC\n+CCCGTTCACACAGGATGATGTTCGGATTCCCCTGCGACAATACATATTCCGCCGCCATCA\n+GCCATTCGTCGATGGTGGCGGACATCCCGCGCTTTAAAAGAATCGGCTTTTTCGATTCGC\n+CCGCCCGCCTGAGCAGGCTGAAATTCTGCATGTTGCGGGTACCGATCTGAACGATATCGG\n+CATACGCTTCGACCATGTCGAAATATTCCAGATCCATTACCTCGGTCACGATCGGCATCC\n+CGAACTGCTCCCGGGCCCTGGCAAGGATCTTGAGCCCTTCCTCGCCGAGTCCCTGGAACG\n+CATAGGGAGAGGTCCGGGGTTTGAAGGCCCCGCCCCGGAAAATGTCGGCGCCCGCTTTCT\n+TGACCCGTTCGGCGGTGGCCATGACCTGCGCTTCGCTTTCCACCGCGCACGGCCCGCCGA\n+TGATGACCAGATGACCGTTTCCGATTTCGACGCCACCGACCCGGATAATGGTGTCATGCG\n+GCTGAATCTCCCGGCTGACGAGTTTATAGGGACGTGTCACCGGAATCGTCTCCTTGACAC\n+CGGGCATATCCTGGAACCAGGCCGCATCGATCGCCGTGCGATTGTTTAAAATTCCGATCG\n+ATACCCGGTCCCCGCCCGGTATGGAGCGGGCCGTATATCCCCTGGCCTCGATCGCCCGGA\n+TCACAGCATCAATCTGTTCCCGGCTGGCATCCTGACGCATCACGATGAGCATGGTCTCTC\n+CTCCTGCAAAATCATTGTCGAATGGGTCATCCTTTAAAATAAAAAACCGTGGGGTCCTCG\n+CGGGGCCCCACGGTCTCAGATACTTTTCGATACCTCAAAAACGGATCAATCCGGTATCGG\n+CCGAAGGGAAAACCCCGCTGATACGGGCTTTCGCCATCCGCGCCAGAATCCGTAAATCAA\n+TACTGCAAATCCATATTGTGGCATTTTAAAAACCGCCCGATTGAACCGGTTTGAATCATT\n+TGGCAATTTTACATCAATTTTTATTTCAATAGGCCTTTATACGCGGCATCGTCAAGAAAA\n+ATATTGTTTACTCTGGCGATTCATGGCCGACACATCAGAAAAATATGCGACAAGGTACGA\n+AAATGATTCTGATTGACATTTTGCGGCGAAACTGGTATGACCTCTTACCATTGAATGAGA\n+TCGGCAATATCTGCGCGGTAAAATTTGGCAACAAGGCGGCGGAT
CCATGACGAATGCGAC\n+AAACGACCAGCCGTTTCGACCGGCCCGGTTCACCGAACAGCGGCTGATCACGGCCATTCT\n+GGACGGCACCTGCCCGCCGGGGTCCGTGCTGCCGGCTGAACGCCGGCTCGCCGAACAGTT\n+CGGGGTCACCCGCCCCACTATCCGGGAAACCCTGCAGCGGCTTGCGGCCGAAGGGTGGAT\n+CACGATCCGCCACGGGAAACCGACCCGGGTCAACGACTTCTGGGAAACCGGCGGGTGTTC\n+CTAGGCTGTTTCCTGGTGGGATCCTCTAGAGTCGACCTGCAGGCATGCAAGCTTGAGTAT\n+TCTATAGTCTC\n'
b
diff -r 000000000000 -r 68a3648c7d91 filter_Fb/filter.fb.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_Fb/filter.fb.pl Thu Dec 22 04:45:31 2016 -0500
[
@@ -0,0 +1,22 @@
+#!/usr/bin/perl -w
+# Keep the header lines of a VCF plus the records whose last column has
+# genotype 1/1 and whose QUAL (6th column) is above 20.
+# Usage: filter.fb.pl <input.vcf> <output.vcf>
+use strict;
+my ($f, $out) = @ARGV;
+die "usage: $0 <input.vcf> <output.vcf>\n" unless defined $out;
+# Checked 3-arg opens: an unreadable input must fail, not yield an empty VCF.
+open(my $in,  '<', $f)   or die "cannot open $f: $!";
+open(my $ofh, '>', $out) or die "cannot write $out: $!";
+while (my $line = <$in>) {
+    if ($line =~ /^#/) {    # header and meta lines pass through untouched
+        print {$ofh} $line;
+        next;
+    }
+    my @col = split ' ', $line;
+    next unless @col >= 6;    # blank or truncated line: nothing to test
+    my $gt = (split /:/, $col[-1])[0];    # first ':'-separated field of the last column
+    # A missing QUAL (".") can never pass the threshold.
+    print {$ofh} $line if defined $gt && $gt eq '1/1' && $col[5] ne '.' && $col[5] > 20;
+}
+close($ofh) or die "cannot close $out: $!";
b
diff -r 000000000000 -r 68a3648c7d91 filter_Fb/filter.fb.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_Fb/filter.fb.xml Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,13 @@
+<tool id="filter_FB" name="filter_FB" version="0.">
+ <description>filter low-quality variants from a VCF file</description>
+  <command> /home/inmare/galaxy/tools/filter_Fb/filter.fb.pl $f1 $o1 </command>
+   <inputs>
+    <param name="f1" type="data" format="vcf" label="unfiltered vcf" help="vcf only"/>
+   </inputs>
+   <outputs>
+    <data name="o1" type="data" format="vcf" label="filtered vcf"/>
+   </outputs>
+   
+   <test/>
+   <help/>
+</tool>
b
diff -r 000000000000 -r 68a3648c7d91 fosm_cluster/4mers.list
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fosm_cluster/4mers.list Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,256 @@
+AAAA
+AAAC
+AAAG
+AAAT
+AACA
+AACC
+AACG
+AACT
+AAGA
+AAGC
+AAGG
+AAGT
+AATA
+AATC
+AATG
+AATT
+ACAA
+ACAC
+ACAG
+ACAT
+ACCA
+ACCC
+ACCG
+ACCT
+ACGA
+ACGC
+ACGG
+ACGT
+ACTA
+ACTC
+ACTG
+ACTT
+AGAA
+AGAC
+AGAG
+AGAT
+AGCA
+AGCC
+AGCG
+AGCT
+AGGA
+AGGC
+AGGG
+AGGT
+AGTA
+AGTC
+AGTG
+AGTT
+ATAA
+ATAC
+ATAG
+ATAT
+ATCA
+ATCC
+ATCG
+ATCT
+ATGA
+ATGC
+ATGG
+ATGT
+ATTA
+ATTC
+ATTG
+ATTT
+CAAA
+CAAC
+CAAG
+CAAT
+CACA
+CACC
+CACG
+CACT
+CAGA
+CAGC
+CAGG
+CAGT
+CATA
+CATC
+CATG
+CATT
+CCAA
+CCAC
+CCAG
+CCAT
+CCCA
+CCCC
+CCCG
+CCCT
+CCGA
+CCGC
+CCGG
+CCGT
+CCTA
+CCTC
+CCTG
+CCTT
+CGAA
+CGAC
+CGAG
+CGAT
+CGCA
+CGCC
+CGCG
+CGCT
+CGGA
+CGGC
+CGGG
+CGGT
+CGTA
+CGTC
+CGTG
+CGTT
+CTAA
+CTAC
+CTAG
+CTAT
+CTCA
+CTCC
+CTCG
+CTCT
+CTGA
+CTGC
+CTGG
+CTGT
+CTTA
+CTTC
+CTTG
+CTTT
+GAAA
+GAAC
+GAAG
+GAAT
+GACA
+GACC
+GACG
+GACT
+GAGA
+GAGC
+GAGG
+GAGT
+GATA
+GATC
+GATG
+GATT
+GCAA
+GCAC
+GCAG
+GCAT
+GCCA
+GCCC
+GCCG
+GCCT
+GCGA
+GCGC
+GCGG
+GCGT
+GCTA
+GCTC
+GCTG
+GCTT
+GGAA
+GGAC
+GGAG
+GGAT
+GGCA
+GGCC
+GGCG
+GGCT
+GGGA
+GGGC
+GGGG
+GGGT
+GGTA
+GGTC
+GGTG
+GGTT
+GTAA
+GTAC
+GTAG
+GTAT
+GTCA
+GTCC
+GTCG
+GTCT
+GTGA
+GTGC
+GTGG
+GTGT
+GTTA
+GTTC
+GTTG
+GTTT
+TAAA
+TAAC
+TAAG
+TAAT
+TACA
+TACC
+TACG
+TACT
+TAGA
+TAGC
+TAGG
+TAGT
+TATA
+TATC
+TATG
+TATT
+TCAA
+TCAC
+TCAG
+TCAT
+TCCA
+TCCC
+TCCG
+TCCT
+TCGA
+TCGC
+TCGG
+TCGT
+TCTA
+TCTC
+TCTG
+TCTT
+TGAA
+TGAC
+TGAG
+TGAT
+TGCA
+TGCC
+TGCG
+TGCT
+TGGA
+TGGC
+TGGG
+TGGT
+TGTA
+TGTC
+TGTG
+TGTT
+TTAA
+TTAC
+TTAG
+TTAT
+TTCA
+TTCC
+TTCG
+TTCT
+TTGA
+TTGC
+TTGG
+TTGT
+TTTA
+TTTC
+TTTG
+TTTT
b
diff -r 000000000000 -r 68a3648c7d91 fosm_cluster/clusterF.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fosm_cluster/clusterF.xml Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,19 @@
+<tool id="cluster" name="FosBin" version="0.">
+ <command> /home/inmare/galaxy/tools/fosm_cluster/compute.stats.pl $f1 $l $o1 $o2 </command> <!-- arguments: contigs FASTA, number of clusters, cluster table, relabelled FASTA -->
+ <description> k-means clustering of assembled fosmids.</description>
+ <help>The tool was designed to tentatively assign contigs from incomplete fosmid assemblies to clusters, ideally corresponding to single fosmids. Clustering is performed based on tetra-nucleotide frequencies of the contigs and coverage. The current version is only compatible with SPAdes output, as coverage is recovered from the fasta headers. Future versions might require a different set of input files. Full details are in Chiara et al. #paper id. Clustering of contigs is performed by a custom script based on the R implementation of the K-means algorithm, using 1500 starting positions for the centroids. The clustering is performed on metrics based on coverage, GC composition and tetra-nucleotide composition of each contig, which are computed directly from the fasta file. The user must input the desired number of clusters; contigs are partitioned accordingly.</help>
+ <inputs>
+  <param name="f1" type="data" format="fasta" label="fasta file with contigs" help="currently need to be in SPAdes format"/>
+  <param name="l" type="integer" label="number of clusters" value="5"  help="should correspond to the number of fosmids"/>
+ </inputs>
+ <outputs>
+  <data name="o1" ftype="tabular" format="txt" label="fosmids to cluster table"/>
+ <data name="o2" ftype="fasta" format="fasta" label="modified fasta file, containing cluster identifiers in the header"/>
+ </outputs>
+  <test>
+        <param  name="f1" value="sim1_galaxy.fasta"/>
+        <param  name="l" value="9" />
+        <o1 name="outfile1" value="res"/>
+ <o2 name="outfile2" value="fasta.fas"/>
+ </test>
+</tool>
b
diff -r 000000000000 -r 68a3648c7d91 fosm_cluster/compute.stats.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fosm_cluster/compute.stats.pl Thu Dec 22 04:45:31 2016 -0500
[
@@ -0,0 +1,81 @@
+#!/usr/bin/perl -w
+# Cluster the contigs of a SPAdes assembly by composition and coverage.
+# Usage: compute.stats.pl <contigs.fasta> <n_clusters> <cluster_table> <out.fasta>
+#   contigs.fasta  headers must look like NODE_<n>_length_<len>_cov_<cov>...
+#   n_clusters     number of clusters handed to kmeans.R
+#   cluster_table  file written by kmeans.R (quoted contig id, cluster)
+#   out.fasta      copy of the clustered contigs, headers rewritten to
+#                  ><cluster>_<rank in cluster>#<original id>
+# Side effect: the feature table is written to <contigs.fasta>.stats.
+use strict;
+use FindBin qw($Bin);
+use Scalar::Util qw(looks_like_number);
+
+my ($file, $ncluster, $ofile, $fasfile) = @ARGV;
+die "usage: $0 <fasta> <n_clusters> <cluster_table> <out.fasta>\n" unless defined $fasfile;
+
+# Helper files are taken from the current directory when present (the
+# historical behaviour) and from the directory of this script otherwise.
+my $flist  = -e '4mers.list' ? '4mers.list' : "$Bin/4mers.list";
+my $kmeans = -e './kmeans.R' ? './kmeans.R' : "$Bin/kmeans.R";
+
+open(my $kfh, '<', $flist) or die "cannot open $flist: $!";
+chomp(my @let = <$kfh>);
+close($kfh);
+@let = sort @let;    # fixes the column order of the feature table
+
+# Load the sequences, joining wrapped lines under their header.
+my (%c, $id);
+open(my $in, '<', $file) or die "cannot open $file: $!";
+while (my $line = <$in>) {
+    chomp $line;
+    if ($line =~ /^>(.*)/) {
+        $id = $1;
+    }
+    elsif (defined $id) {    # ignore anything before the first header
+        $c{$id} .= $line;
+    }
+}
+close($in);
+
+# Label for the last column of every row: the file name up to its first dot.
+my $name = (split /\./, $file)[0];
+
+open(my $stats, '>', "$file.stats") or die "cannot write $file.stats: $!";
+foreach my $s (sort keys %c) {
+    # Length and coverage come from the SPAdes header, not from the sequence.
+    my ($len, $cov) = (split /_/, $s)[3, 5];
+    next unless looks_like_number($len) && looks_like_number($cov) && $len > 1000;
+    my $seq = uc $c{$s};        # count soft-masked (lower-case) bases too
+    my $le  = length $seq;
+    next unless $le;            # guards the division below
+    # Only G and C count towards GC; N and other codes do not.
+    my $gc = ($seq =~ tr/GC//) / $le;
+    # Tally every 4-mer on both strands.
+    my (%DD, $tt);    # 4-mer counts and their total
+    for my $i (0 .. $le - 4) {
+        my $subs = substr($seq, $i, 4);
+        (my $rc = reverse $subs) =~ tr/ACGT/TGCA/;
+        $DD{$subs}++;
+        $DD{$rc}++;
+        $tt += 2;
+    }
+    my @freq = map { $DD{$_} ? $DD{$_} / $tt : 0 } @let;
+    print {$stats} join("\t", $s, $gc, $cov, @freq, $name), "\n";
+}
+# Close first so that kmeans.R is guaranteed to read the complete table.
+close($stats) or die "cannot close $file.stats: $!";
+# List form: no shell is involved, so odd characters in paths are harmless.
+system($kmeans, "$file.stats", $ncluster, $ofile) == 0 or die("no kmeans");
+open(my $of,  '<', $ofile)   or die "cannot open $ofile: $!";
+open(my $fas, '>', $fasfile) or die "cannot write $fasfile: $!";
+my $header = <$of>;    # header line of the table, not needed
+my %NC;                # number of contigs emitted so far, per cluster
+while (my $row = <$of>) {
+    my ($cid, $cluster) = (split ' ', $row)[0, 1];
+    next unless defined $cluster;
+    $cid =~ s/"//g;
+    $NC{$cluster}++;
+    print {$fas} ">${cluster}_$NC{$cluster}#$cid\n$c{$cid}\n";
+}
+close($fas) or die "cannot close $fasfile: $!";
b
diff -r 000000000000 -r 68a3648c7d91 fosm_cluster/kmeans.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fosm_cluster/kmeans.R Thu Dec 22 04:45:31 2016 -0500
[
@@ -0,0 +1,11 @@
+#!/usr/bin/env Rscript
+# Usage: kmeans.R <stats.tsv> <k> <out>; writes the cluster index of each row.
+cli <- commandArgs(trailingOnly = TRUE)
+k <- as.integer(cli[2])
+out_path <- cli[3]
+# The first column supplies the row names; keep the next 258 columns.
+features <- read.table(cli[1], sep = "\t", row.names = 1)[, 1:258]
+# Rescale the second kept column so that it sums to 1.
+features[, 2] <- features[, 2] / sum(features[, 2])
+fit <- kmeans(features, k, nstart = 1500)
+write.table(fit$cluster, out_path)
b
diff -r 000000000000 -r 68a3648c7d91 fosm_cluster/res.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fosm_cluster/res.fasta Thu Dec 22 04:45:31 2016 -0500
b
b'@@ -0,0 +1,38 @@\n+>9_1#NODE_2_length_40000_cov_63.1617_ID_3\n+CCAGCGGCCAGGTCGATCGACAGCACGCACTGCCCGGTGTAGGGCAACAGGCGCTCGCGGTCGTCCAGGCTGCCCGCGCAGGGCTTGACCACCATGACATCGTTGGCACCGGTTTCCAGCAGATGGTCGATCACGCCGAGCAACTGCCCGCCCTGGTCGATCACCTTCAGGCCTTCCAGCTGGTGCCAGTAGTACTCACCTTCCTCGAGAGAGGGCAACTCGCTACGCGGGATGCAGATCTCGTAACCGGTGAAGGTGCGGGCCTCTTCGCGATCGTCGAGCCCCTTGAGCTTGGCGGCCAGGACCTTGCCATGCAGGCGCCCCCTGACCAGCTCGGCCTGCCGAATCTCGCCGTCGCGCCGGAGCGTCCAGCGGCGATAGTCCAGCAGGTTGTCCAACGGGTCGGTAAAGGAATACACCTTCACCTCACCGCGGATGCCGTACACCGAAACGATCTTGCCGATCACGACCAGGTCGTCGGCGGGTGTCGGCATTTCACTCATGACGGCTTAGGCGTTTGCCTTGGCAGCGTCCTTGAGCAGCTGAGCAACGCGCTCAGACGGCTGTGCACCCTGGCCGAGCCAGTAGGTAGCACGCTCCTGGTCGACGGACAGACGCACTTCGCCACCAGTCGCAACCGGGTTGAAGAAACCGATGCGCTCGACGAAGCGACCATCGCGCGCATTGCGGCTGTTGGTCACGGTCAGGTGGTAGAAGGGGCGCTTTTTGGAGCCGCCACGAGCAAGACGGATGGTTACCATGTGAACTTCGTTCCTGTAGTCGGTGCTTGCAAAATGAATGCACGCTGGGCCCACGGCCCGAAAGGCCGCATATTCTAAGGATTATCGGGGAATTTGCAAATCTCTTTTTCGGCCACCCATCGGCCGGCGCGGAGAGCGCGGGAAGCACACGGTCCGCCCGCCGGCCGACGGCGCGAAACCGCCGGCGTGGCCAGCGGCCGCGCGTCCGATCAAAGCTTGGGCATGCCCCCCGGGAACATACTGCCCATGCCTCGCATCATCTTGGCCATGCCGCCCTTGGCGGTGACCTTCTTCATCATCTTCTGCATCTGCTTGTGCTGCTTGATCAGCCGGCCGACGTCCTGCACCTGGGTACCGGAGCCAAGGGCGATGCGGCGCTTGCGCGAGCCGCTGATCATTTCCGGATCGCGCCGTTCGCCGGGCGTCATCGAGTTGATGATCGCCTCCATCTGCTTGAACTGTTTCTCCGCAGCACCCTGGGCATTGCCCATCTGCGCCAGGTTGACCCCGCCGAGCATCGGCAGCTTGTCCATGAGCCCGCCGAGGCCGCCCATGTTCTTCATCTGTTGCAACTGGTCGCGGAAGTCTTCCAGGTCGAAGCCCTTGCCCTTCTTGATCTTCTTCGCCAGCTTCTCGGCCTTGTCGCGGTCGAGGTTCTGCTCGGCCTGTTCGATCAGGCTGAGCACGTCGCCCATGCCGAGGATGCGCGAGGCCACGCGGTCGGGATGGAACGGATCGAGCGCTTCGCTCTTTTCGCCCATGCCGAGGAACTTGATCGGCTTGCCGGTGATCGCCCGCACCGAGAGCGCGGCACCACCACGCGCGTCGCCGTCGACCTTGGTCAGGACCACGCCGGTCAGCGGCAAGGCGTCATTGAAGGCCTTGGCGGTGTTGGCGGCGTCCTGGCCGGTCATGGCATCGACCACGAACAGGGTTTCCGCCGGCTTGATCGCCGCGTGCACCTGCTTGATCTCGTCCATCATGTCGGCATCGATGTGCAGGCGGCCTGCGGTATCGACGATCACCACGTCGATGAACTTCAGTCTCGCCTCGCGGATCGCCGCCTCGGCGATGGCCACCGGCTTCTGGCTGACGTCGGAAGGGAAGAAGGTTACGCCAACCTCGCCCGCCAGGGTTTCCAACTGCTTGATCGCGGCGGGTCGGTAGACGTCGG
CGGAAACCACCATCACCGACTTCTTCTTGCGTTCCTTAAGGAAGCGCGCCAGCTTGCCCGCGGTGGTGGTCTTGCCCGCGCCCTGCAGGCCGGCCATCAGGATCACCGCCGGCGGCGCGACGCTCAGCGCCAGGTCCTCGTTGGCCGCCCCCATCAGCTCCTCGAGCTCGGCGCGGACGATCTTCACGAACGCCTGTCCCGGGGTCAGGCTCTTCGAGACCTCGGTACCGACGGCGCGCTCCTTGACCTTGTTGACGAAGTCCTTGACCACCGGCAGGGCCACGTCGGCCTCGAGCAGGGCCATGCGCACTTCGCGCAGAGTGTCCTTGATGTTGTCCTCGGTCAGCTTGGCCTTGCCGGTGACATGGCGAAGCGTCTGCGAGAGGCGGTCTGTAAGGTTTTCGAACATGCGCGATCCTTCCACGGGGGTTGCGGCAAGCGGCGGATTATAACCAAGAGCGCGTCGCGGGACACGCGATGAAAGACGGAGTGTGTGTCCGGCGACGACCGGAGGCCAGCCTCAAGTCCTTGATTTGCCGATGCGTTCCGTCTTTGCCGAAGCGACCGCCGGACCATGAGGTCGCGGCGCCCGCGGCAGGCGGTCCGCCAACCCGGCCACAGGTACGAGAGAACTGTTCGCAACGGCTTTCTCGCGACGCTCGATCTATGCCAAACTCCCATCCTTTCGGGCCCGCGATTACAAGGACTTATGCATCCCCTGCTGCCCAGCCTCATCGCTGCTGTTCTCTATCTCGGCACCGCCGCCTACCAGGGTGCCTGTGTGTCCAAGCGCACCGCGCCGGGCAAACCCCTGCTGCTCCTGTTCGGACTCCTGGCACTGGTCGCCCACGCCTTCAGCCTCTACCAGCAATTGCTGACCCCGGCCGGCCTGGTGCTGGACTTCTTCAACGCCGCCAGCCTGATCGCCGCCGCGGTAATTCTCCTGACCCTGCTGGCGACCTTGCGGATACCGGTGGAGAACCTGCTCCTCCTTCTATTCCCGCTGGGTGCCCTCACCACCCTGCTGGCGGTACTCATCCCCCACGGCACCGTCGAGCCGATCAACGAACAGCCAGGCATTCTGGCCCACATCCTGCTCTCGATCCTGGCCTACGGCCTGCTCACCATCGCGGTGTTCCAGGCGCTCTTGCTGCTGCTCCAGGACTACCGCCTCAAGCACAAGCATCCGTCCGGGCTGATCCGCAACTTCCCGCCCCTGCAAACCATGGAAAGCCTGCTGTTCGGCTTCCTCTGGGGCGGCTGGTCGCTGCTTTCGCTGTCGCTGCTGTCCGGATGGCTGTTCGTCGACAACCTGTTCGCCCAGCACCTGGCGCACAAGACCATCCTGTCCTGCTTCGCCTGGGTGGTCTTCGCCGTGCTCCTCTGGGGTCGCCACCAGCTCGGCTGGCGGGGCCACAAGGCGATCCGCTGGACCCTGGCCGGTTTCTGCCTGTTGATGTTGGCGTACTTCGGCAGCAAGCTGGTGCGGGAATTCATCCTGCACATCTGATGGGCCTCCTTCATGGATGAGCTGCACCCCGGGTACCTGGTCGGCCTGCTGGTTCTCCTGGTCGCCTGCTCGGCGTTCTTCTCCTGCGTCGAGACCGCCCTGCTCAACCTCGACCGTTACCGCCTGCGCCTGCAGGCCAAGCAAGGCCTGCGCGGGGCCCGGCGCAGCAGTTGGCTGCTGCTGCACGACGACCGCCTGCGCGGCACCCTGTTGTTCGGCCGCACCCTGGTCAACGTCAGCGCCGCCGCCCTGGCCAGTTGGGCCGCGCTGCGCCACTGGGGCGTCATCGGCCTGGCCGTCGCCATCCCCGGCATGACCCTGCTCCTGCTGCTGTTCGGCGCCCTGCTGCCGCGCCGCTACGGCGCCCTTCGCTCGGAACGCGTCGCCCTGCCGCTCAGCCTGCCGCTCCTGATCCTGCAGCGCCTGTGCTGGCCCCTGCTGTGGCTGCTGACCCTGCTGAGCAACGCGCTGCTGCGCCTGCTCGGCGTCG
CCG'..b'CAGCATTACGCGGTGGGCTACCGCCTGGACGGCAAGTGCCAGTTCGCCACCCGCGTGGCCCAGCGCTGCCACGACATCGTCCAGCACCCGAGCCTGCCGCTGGCGCTGTTCGTCGCCCGCCGGCCAGGCACCGAAAGCTACCTGATCGACCTGAACGACGGTCGCCTGCTGCAGACCCTGGTCTCGCAGAAAGACCGCCACTTCTACGGCCACGGCGTGTTCCACCAGAGCGGCGAGTGGCTCTACGCCACCGAGAACGACACTACCGATCCCGGTCGCGGCGTGCTCGGCGTCTACCGTTTCGACGGCGAGCGACTGCAGCACAGCGGCGAGATCTCCACCCACGGCCTCGGGCCGCACCAGGTTTCCTGGATGCCCGACGGCGAGACCCTGGTAGTGGCCAATGGCGGCATTCGCACCGAGGCGGAAAGCCGGGTCGAGATGAACCTCGACGCCATGGAGCCCAGCCTGGTGCTGATGCGCCGCGACGGCAACCTGCTGTCCAAGGAAACCCTGGCGCAGCAGATGAACAGCGTCCGCCACCTGGCGATCGGTCGCGACGGCACCATCGTCGCCGGCCAGCAGTTCATGGGCGATGCCCACGAACATGCCGACCTGCTGGCGATCAAGCGCCCCGGCCGTCCCTTCGAAGCCTTCCCGGTGGCCGAGGAGCAGCGCCTGGCGATGGCCCAGTACACCGCCAGCGTGGCGATCCACGACGACCTGCGCCTGGTGGCCCTGACCGCCCCGCGCGGCAACCGTTTCTTCATCTGGGACCTGGACAGCGGTGCCGTACGCCTCGACGCGCCGTTGCCGGACTGCGCCGGCGTGGGCGCGGTGAAGAACGGCTTCGTCGTCACGTCCGGACAGGGCCGTTGCCGCTTCTACGACTGCCAGGGCGAGCGCATCGCCGCGCAACCGCTGGAGCTGCCTGCCGGCCTCTGGGACAACCACCTGCACCTGGCCTGAGAGCACGGCATCAACCTCTCGCCATACCGGTCCGCGAAAGTCGGAAGCGACCTGCAACACGCTTGAAATCTTTCGCCAATCCTGTGCTTTGACAAGCCCTGAAAGCTGCACGTAATGTGTCGGATTCGCCCCGAAACGGGGCGCTCTCGGCATACCAGGGACCAGGATTATGTTGCTCCGCCGCATGTTGATCATGCTCGCCGCGGTGATCGCCGTGGTGGCGATTCTCGCCGGCTACAAGGTCTACTCCATCCGTCAGCAGATCGCCCTTTTCAGCGCACCGAAACCGCCGATCAGCGTGACCGCCAGCCTGGCCGAAAAGCGTCCCTGGCAGAGCCGCCTGCCAGCCATCGGCAGCCTCAAGGCATTCCAGGGCGTGACCCTCACCGCCGAAGTCTCCGGCACGGTACGCGACGTACTGTTCCTTTCCGGCGACCAGGTGAAGCTGGACCAACCGCTGATCCAGTTGGAAAGCGACGTCGAGGAAGCCACCCTGCGCACTGCCGAGGCCGATCTCGGCCTGGCCAGGGCCGAGTACCAGCGCGGCCGCGAACTGATCGGCAGCAAGGCCATCTCGAAAAGCGAATTCGATCGTCTCGCCGCGCAGTGGGCCAAGACCAGCGCCACCGTCGCCGAGCTGAAGGCGGCGCTGGCGAAGAAGCGCGTGCTCGCGCCCTTCGCCGGGACCATCGGCATCCGCCAGGTGGACGTCGGCGACTACGTCTCGCCCGGGACGCCGATCGCCACCTTGCAGGACCTTTCCACCCTGCTCCTGGATTTCCACCTGCCCGAGCAGGACTTCCCCCTGCTCAGCCGCGGCCAGCTGGTGAAGGTCCGGGTCGCCGCCTACCCCGGCCAGGTGTTCGACGCCGAGATCGCCGCCATCAACCCCAAGGTCGACAACGAGACCCGCAACCTGCAGGTCCGCGCTGCCCTGGAGAACCCGGACGGCAAGCTGCTGCCGGGCATGTTCGCCAACCTCGAGGTGATGTTGCCTGGCGAGGAACAACGCGTCGTGGTGCC
GGAGACGGCGATCACCTTCACCCTCTACGGCGACTCGATCTACGTCGTCGGGCAGAAGAAGGACGAGCAGGGCCAGGTGTCGAAGGATGACAAGGGCCAGCCGCAACGGGTCGTCGAGCGCCGCTTCGTCAGGATCGGCGAACGCCGCGAAGGCCTGGCGGTGGTGCTCGAAGGCCTGGAGGGCGGCGAGCAGGTAGTGACTTCCGGGCAACTGAAGCTCGACAACGGCGCCGCGGTGGCCATCGTCGCCGAGCGGGACCTCCAGCAAGAGCACTGAGTCGCGCGCCTTCCCACTCCGTGGCGGAAGGCTTGCCAAGGGACTGAAACATGGCTTTTACCGATCCGTTCATCCGTCGTCCGGTCCTGGCGAGCGTGGTCAGCCTGCTGATCGTCCTGCTCGGCATGCAGGCCTTCAGCAAGCTGGTGATCCGCGAGTATCCGCAAATGGAGAACGCGCTGATCACGGTGACCACGCTCTACGCCGGCGCCAACGCGGAAACCATCCAGGGCTACATCACCCAGCCGCTGCAGCAGAGCCTGGCCAGCGCCGAAGGCATCGACTACATGACCTCGGTGAGCCGGCAGAACTATTCGACCATCTCCATCTACGCGCGGATCGGCGCCAATACCGATCGCCTGGTCACCGAGCTGCTGGCCAAGTCCAACGAAGTGAAGAGCCAGCTGCCGCCGGACGCCGAGGACCCGGTGCTGCAGAAGGAGGCCGCGGACGCCTCGGCGCTGATGTACATCAGCTTCTACAGCGAGCAGATGAACAACCCGCAGATCACCGACTACCTGTCGCGGGTGATCCAGCCCAAGCTGGCGACCCTGCCCGGTATCGCCGAGGCGGAGATCCTCGGCAACCAGGTGTTCGCCATGCGCCTGTGGCTGGACCCGGTGAAGATGGCCGCGTTCGGCGTCACCGCCGGCGAGATCAACCAGGCGGTGCAGCAGTACAACTTCCTCGCCGCCGCCGGCGAGGTGAAGGGCCAGTTGGTGGTCACCAGCGTCAATGCTTCCACCGACCTCAAGTCGCCCCAGGCCTTCGCCGCCATCCCGGTGAAGACCGACGGCGACCGCCGGGTGCTGATGGGTGATGTCGCACGGGTCGAGCTGGGCGCCGCCAGCTACGACGCGATCAGTTCGTTCAATGGGATTCCCTCGGTCTACATCGGCATCAAGGGCACGCCCAGCGCCAACCCGCTGGACGTGATCAAGGAAGTGCGGGCGAAGATGCCCGAACTGGAAGAGCAATTGCCGCCCAACCTCAAGGTGTCCATCGCCTACGACGCCACGCGCTTCATTCAGGCCTCCATCGATGAAGTGGTGAAGACCCTCGGCGAGGCGGTGCTGATCGTCATCGTGGTGGTGTTCCTGTTCCTCGGCGCGTTCCGTTCGGTACTGATCCCGGTGGTGACCATTCCGCTGTCGATGATCGGCGTATTGTTCTTCATGCAGGCCATGGGCTACTCGATCAACCTGCTGACCCTGCTGGCGATGGTCCTGGCCATCGGGCTGGTGGTGGACGACGCGATCGTGGTGGTGGAAAACATCCACCGCCACATCGAGGAGGGCAAGCCGCCCTTCGAGGCCGCCCTGGAGGGCGCGCGGGAGATCGCCGTACCGGTGGTCAGCATGACCATCACCCTCGCCGCGGTCTACGCGCCGATCGGTTTCCTCACCGGCCTCACCGGCGCCCTGTTCAAGGAGTTCGCCTTCACCCTGGCCGGCGCGGTGATCATTTCCGGGATCGTCGCCCTGACCCTGTCGCCGATGATGTGCTCGCGCCTGCTGCGCCACGAGGAGAATCCCTCGGGCCTGGCGCATCGCCTCGACCTGATCTTCGAGGGCCTGAAGCAACGCTACCAGCGCGCCCTCCACGGCACCCTGGACACCCGTCCGGTGGTCCTGGTGTTCGCCGTGCTGGTACTGGCGCTGATCCCGGTACTGCTGATGTTCACCAAGAAGGAGCTGGCGCCGGAAGAGGACCAGGGCATCGT
GTTCCT\n'
b
diff -r 000000000000 -r 68a3648c7d91 fosm_cluster/sim1_galaxy.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fosm_cluster/sim1_galaxy.fasta Thu Dec 22 04:45:31 2016 -0500
b
b'@@ -0,0 +1,12526 @@\n+>NODE_19_length_33054_cov_76.4496_ID_37\n+GGCCTGTCCGGCTGACGTGCTGGCTGCGCGTGCTGAGTCCGCAGCATCAGCCGCATGGGT\n+TGCCGCCTCACGGGCTGATGTGCTGGCATCACTGGCTGACTTCTTCGCGGCTGCCGTGTT\n+CTGTGCCACCGCGGACGCGTTACGCGCCACCTCTTCCACCATCAGTTCAAAACGGCGCAG\n+TGCCTCCGGACGGGCATCATCCTCCGTCATGGCACCGAGAAAATCATTCAGCGTACCGGG\n+TTGAGAATCTTCATACACGGTAATGGTCCCGGCATGTGACGGCGGGAATCCTTCCACCAA\n+CAGAATAACGCTGTACTGACCGTACTCAACGTCCATGCTGTAACGCCCGGCTTCATCCGG\n+ATTTTCTGAGGCCAGCGTGTTCACCACCACCGTGGTGCTGTTACGTTTTGCTTTCAGCTG\n+GATTGTGCAGTTCTGTACCGGTTTTCCTGTGCCGTCTTTCAGTACACCTGAAATCTTTAC\n+TGCCATATTCACCCCACAAAAAAGCCCGCCTGAACCGGCGGGCTGTCATAACACTGTGTT\n+ACCTGGCTAATCAGAACTTATAACCGACACCCACGATGAAACCGTCAGTGCGCCAGTCGC\n+CACTGCCGGGGCCTTCATAAGCAATATCAATGGCCACGGATTCGGTCGGGTTAAACTGCA\n+CGCCAGCTCCCCACGCCAGAGACGTGTTGCTGTGGCGACCGTCATCACTTCCGGTCAGCA\n+CGTCGTGCGTTTTCCCCTTGGGAAGGTGCGAACAAGTTCCTGATATGAGATCATCATATT\n+CATCCGGAGCGCATCCCAGAGGGACATCATGAGCCATCAACTCACCTTCGCCGATAGTGA\n+ATTCAGCACTAAGCGCCGTCAGACCCGAAAAGAGATTTTCCTCTCCCGCATGGAGCAGAT\n+TCTGCCATGGCAGAATATGACCGCTGTCATCGAGCCGTTTTATCCCAAGGCGGGCAATGG\n+CCGACGGCCCTATCCGCTGGAGACCATGCTGCGTATTCACTGCATGCAGCATTGGTACAA\n+CCTGAGCGACGGTGCCATGGAAGATGCCCTGTACGAAATCGCCTCCATGCGCCTGTTTGC\n+CCGATTATCCCTGGATAGCGCCCTGCCGGATCGCACCACCATCATGAATTTCCGCCACCT\n+GCTCGAGCAGCATCAACTGGCCCGTCAATTGTTCAAGACCATCAATCGCTGGCTGGCCGA\n+AGCAGGCGTCATGATGACCCAAGGCACTTTGGTGGATGCCACCATCATTGAGGCACCCAG\n+CTCTACCAAGAACAAAGAGCAGCAACGCGATCCGGAGATGCATCAGACCAAGAAAGGCAA\n+TCAGTGGCACTTTGGCATGAAGGCCCACATTGGTGTCGATGCCAAGAGTGGCCTGACCCA\n+CAGCCTGGTCACCACCGCGGCCAACGAGCATGACCTCAATCAGCTGGGTAATCTGCTTCA\n+TGGAGAGGAGCAATTTGTCTCAGCCGATGCCGGCTACCAAGGAGCGCCACAGCGCGAGGA\n+GCTGGCCGAGGTGGATGTGGACTGGCTGATCGCCGAGCGTCCCGGCAAGGTAAAAACCTT\n+GAAGCAGAATCCGCGCAAGAACAAAACGGCCATCAACATCGAATACATGAAAGCCAGCAT\n+CCGTGCCAGGGTGGAGCACCCGTTTCGCATCATCAAGCGGCAGTTCGGCTTCGTGAAAGC\n+CAGATACAAGGGGCTGCTGAAAAACGATAACCAACTGGCGATGTTATTCACCCTGGCCAA\n+CCTGTTTCGGGTGGACCAAATGATACGTCAGTGGGAGAGATCTCAGTAAAAACCGGAAAT\n+AACGCCAGAAATGGTGGAAAAAATAGCCTAAATAGGCTGATTCG
ATGTGTTTGCGGGAAA\n+AAAATCGGCCCAGATCCGCGAAATTTTAATCAGCGAGTCAGCTTGGGAAGAAATGACCTG\n+CTTATTCGCACCTTCCCTTGTTGTCAGTTACGCGGAGATAATCCCCGGAGAAAGTCGACA\n+CACGGCTGTAAGCCATACCCGCCATCGCATACGCGCTGAACCATTCATTCACGCGCACAG\n+ACGGCCCCACCATCACGCTGAACCAGCGGTTACGCACGGAATCTTCATGCCAGCGGGTAT\n+CGCTGTAACGGGTAAGCTGGCGATTCTTGTCTCCTGCATAGCTGAATGGCCGCCCTGCTG\n+CATTAACACCATAAATGGCGACTGACCAGTGGACAACCCAACAACAATATCCGTCATTTG\n+TGCAGGCAACATGCGCATAGCAAAAGCCGTTTGTTTTGCCGACATTCCGGTTTTGCTTAA\n+TTGTGATTGAGTAACCTCAAGCTCACTCCGCATAGCACGAAGTTTTCCAGAAAGCTCCTC\n+ATACATTTCAGGAGAAAGCATCCCCTTAGCTTTTGCTTCATTGAGCTGTTTCTGTTGTTC\n+TACCAGACGATTAAAAGCAGTTCCGACAGGATCAAGTTGAGCAATCAGACGTTGCAAAGC\n+AACAACCTGTTCATCATGCGCTTTTGCTGCTTCTCGCTCTGCCTGAGCCTCTCCGGTAAG\n+CTCTCGCCGTGTTTCCTGTATTTTTCGGCTATAATTCTCAAACTGAGAACCATTTATTTT\n+CCCGGATGCAAACGCAGCATTAAGTTCATCATGCTGTTGTTCAAGATTTCTTAGCGCCGC\n+AGCCAGAGGGTCGATCTTGTCCAGCATTCTTTGAAAGGCCTGAGCCTGCGCTTCCTGCTG\n+AGCGGCAGCAAGTTTTCCGGCCTTCTCGGCTTCTCTCTGCGCTTGCGCAACCCCGCTCAA\n+TTCCTCTGTGGTTTCATTAAGTTTACGGACAAGAAATTCATATTCTTCTTTATCAATAAG\n+CCCTTTATCGAAAAATTTCTTTAATTCAGAATAGCGTCGACCGACAGTATCAATTGCGGC\n+ACCAACTGGATCAATAGCTGCTTTTAATTTTGCGAGCGCGTTCTTCTCATCTTCTGTTGC\n+CTTAGTCACTTTCCCTGCGCTATTTGCAGCAGTTTCCCCAGCCTGCGTCATTTTGACTAA\n+TGAGGAGGTCAGATTGTCAGCATTATTTTTCGCTCCAGTGCTATCAATAATTATTGCGAG\n+ACGCGAGGTTTGCTCTGCCATTTATTAAAACTCCTGACAACAAAAAACCCACCGCGAAGT\n+GGGTTTCAGGCGACATAATAGTAGATATAGCGATTACGAGGCCACGCAATGCTTTTCTCC\n+AGGAGCATCATCGATTTAATTAAAGACACCATCACATCTCTGTAACAGAGTGTACGTAAT\n+TAACAACTACACACACTGCTCCTGAAAATACTGGTCATCCAGTGCAAAGATCACTGCTTC\n+AAATTCATCGCGCTCAATCAATACCGGATGAGTGGCTAAATATTCATTTATCTCTGTCAG\n+AGATAAAGGCAAAGGCACCCCAGCCATTCCAGCATAACGTCGGGCACGGGATATTACCGA\n+ATAGGCGTACAACAACTCCTTAAGCACCGGGTCTATTTCTGGTTCCGGTATCGGTGGCAA\n+CCTGAGTTTTTCTCGCTTCCATCTTGCCTTTTCCCCCCTTTCTCCCCCGAACTCCGATAA\n+CCACCGCTGGGCAGCTATGGCTTTTTTATCGTATCCTGCTTCTGCTGCTCTTTACCCTGG\n+GCGATGCTGGCTGCTTCTGCAAGGATCTGCCAGTACAACTCTGGATTCTGCTTAAGCAGC\n+GCGATCCCTCGTTCTGCCGTATATTCCAGTGCAACCTCAACACCATTAACCAGTTCACCA\n+ACCCCTTTCCAGTCTTTCAGCAGATAAC
GAGCGGCATTATCAATGAGTAAATCATCAACA\n+GAATCCACCTCGGAAACCTTTGAAATATCAA'..b'TTGCTATTGATACGGTGAACCATGTCATAAGTGAGTGATAAATTTATTAA\n+GGCCAAATAATGAAAATGAATTAGAATAAAGGAACTTACCATATAAGAAAGTAATATGGT\n+AAGTTTTATTTAATTTATAGACAATATAGAGACAAACAGAATAGAAAATTCATTGCAAAG\n+AGCCTATAAAAAATGAAAAGAACACTATTTGATATAATGTTCTTTTCATCTTTTTAAAGT\n+ACCCATACCTGTAGAAACAAACCCAATAATCTCAGAAATTAGAACTTCTTGAGGGAGTGG\n+TCTATTCTGAACTTCCCATTTGTAAGTTACATTATAAATGGAAGTGCTCATCATTGTTGC\n+AACTAATTGGGCTCGTATTTGTTCATTTTCAGCTAATTCATGGTCTTTTATTAGCATCTC\n+ATAGATTATGGAATATAGCTCACAAGATATTTTATCTCCTATTAAAGGTAAAATAGAGGC\n+AAAACCGCACTTGCAGATGCTTTTTACCATTTCAAAGTACTCACAAACGCAAAGAATGAG\n+GCTACGTATAGTATTCTCATTTAGGCGCATCTCGCCTAGTAGTTTATGGGAAAAGATCTC\n+AGAGAATTTCTCTACTATGACGGTGTCCAGTAGTGCATATTTATCTGTAAAATGTGCATA\n+GAAGGTTGCACGGTTTATGGTGGCTGCGGTAGTAATATCCTTAACTGTAATGGAGTCAAA\n+GCCTTTTTTAGTAACTAGTGAAATAAAAGCGTTGATAATAAGATCATGGGTTCTTTTTAC\n+TCTAGGATTATTTATATTTACTGACATATCATTACCTCATTTAAAAGAATTTTAGTTAAA\n+CTCATATTGCAGATTTAGTTTTATGTAAGCAACACTTTATCAATTTTGTTGGTTGAAAAA\n+TGTATAAGCTAATTTTATAATCATTATAAGCAACACGAGTTGGCTTTGCAATATGTGTAG\n+CTATAATAATTGTTTGAAATAAATATTTATTTCTAGGGGGTTAAATATGTCAACTATAAA\n+TCCAAACTGTAAATGCATAAATAAAAGCTGTAAAAATCACGGGAATTGTAAAGCATGTCG\n+AGAGTTTCATAAATCACAATCATATCTTAATGAAACATATTGTCAGGCTGGTGCAGTTAA\n+ATTCATAACTAAAGGATTAATATCTAAGTTAAAGTTACATTCTTAGCACGCAACCTAGCA\n+ATTCAAATGCGGACTTACTATAAAATAAAAAGGGTGGGAGTGCTGAAATATGAATTTAGA\n+TAATCCGATTTTCAAGAAGCCAATTACAGAAGTTATTAAAGCAAGAACATCTATGAGGTC\n+ATATAATGGGGTGCCTTTGGACAAAACAATAGGCGATAGCATAATGGATGTCATAAATCA\n+GGTTAAAGCACCTTTTTGCACAAATATACGAGTTAAGATGATAAACTCTAAAGATTCAGA\n+TTTAAAGCTTGGAACCTATGGTATTATAAAAGGAACTTCTACCTTTATAGTCTCATCAAT\n+TTCAAAATCAGATAATGCATTAGTTGATTTAGGATATTTATTGGAAAGAGCAGTGCTGTA\n+TGCAACTGATTTAAATTTGGGTACATGTTGGCTAGGCGGTACCTTTAATAAAGGTCAATT\n+TGCAGAGGTTATGGAGCTTAAAGATAATGAAATCCTTCCAATAGTTATACCTGTAGGGTA\n+TCCTAGTGAAGCAAGAAGAGGTATGGATACTTTCGTTAGGTTTATGGCAGGTTCTAAAAA\n+TAGAAAAGCCTGGTCTAAACTATTCTTTGATGTAGATTTCAATACACCTTTAAAGGAGCT\n+GGAAAGCTTAGAGTATTTTATTCCCCTTCAAATGGTAAGGCTTGCGCCT
TCAGCAGCAAA\n+CAAGCAACCTTGGAGAATAATTAAAAGTTCTAATTCATACGATTTCTATTTAGAACGAAG\n+TAAAGACTCAAAAAGTGATAGTTATAATGATATGCACAAGATAGATATTGGCATAGCTAT\n+GTGTCATTTTGAGTTAACTTCAGCAGAATTAGGGTTAGAGGGACATAGGGAGAAACTTAA\n+GCTTTCATCTCAAGATAATAAAAAATATATTATTTCTTGGGTTAGAAAATAGGGAAGCGA\n+ATTTGCTATAAATGAAATATATGTTGAATATGTAAGAGATATTGATTACTATAGCAAGTT\n+ATTTTCACATTTTTCTTCATGAACTTTTTTTCTTTCTTCGTTAATAATTTGCGCCGCGTT\n+AGGAGATGTATACTTCACTATTTTGGTATGAATAAGGTCTAAATTTTCATTGAGATCATT\n+TTAAAGGAGTTTTCTTTTTGACACAGAAACTAATTCCGCTTATTGCTGACAAAGGTGGGA\n+TTGTGAACACTTCAACAGGATTAACTCGTTTTACATCTCCAGGTTCCAGACTTTATGCTG\n+CTGCGAAAGGTTCAGTAGAAGTCTTCACTAGATATTTAGCAAAAGAATTAGGAACTAGAG\n+GTATAAGAGCGAATACGATAGCGCCTGGAGCTAGTAAAGGTGCCCCTTATTTTAGAAATC\n+GCTTCATTGATTTAGTTACTTATAAATTAGAGGATGAAGTAAATGTTACAGAAGGAGTAA\n+ATTCAGGATTAAGGAAAGATGCTGGCAAGCAATTTCTTGGGCAAGCTATAGTAGGATTGA\n+TAGAATGGTGGTTTATAAATGGAATGCCTTTTTCTCCTGATGTCTTGGTGGAACATTTAG\n+GAGAAGTGATAGAGAGAAATTTGTAATCATTAATTTAGACTGATTCAAATCCATTTAATT\n+ATAGATAGGAAATATTAAAGGAGGTTTAGAATAATTAATAGTGAACCATATCACAAATAA\n+AGGATAATGAGTAAGTTTGGGTAAAATTAGGGATAAGAAATAAATTTAATCCTAGTAAAG\n+CTTATTGTTGCATTACTAGGATATTTTAATACGTTCAATATTATAAAAATAGGTGCCAAA\n+ATTTTGACTCTTACGTTTAAATTGAAAAATTGTAATTGCTAGACTAAAGTATATTAAGGT\n+TAAAAAAATATAATTAGTGAGGTGAAATCAAATGTATATTATACTGATTGCAGGAATGCC\n+AGCAGTTGGTAAGACAACATTTGCTAACAAGGCATAGAGGTCACAGAATATCCTGAATCA\n+ATAAATCACAATGCTATTCCTGATCAAATAAAATGGGTTTCAGTGCAAATCACAAGCACT\n+GAAACCCATTTTAATTTGAAGCTTAACGTATATATTAAAATATCATTACTAATATTTTAG\n+GTAACAGCATGGAAACACCTACTGCATAAAGTAAATCAAGCCACATTACAGAACCAACCG\n+CGCCAATATAAATAAGACCAATGACAGGCGCTACTAATATTTTTATTAAAAATGATACTT\n+CTTTATTCTTTAAAATAGCATTTACAATACATTTTGCATCACCTGTACTTGGGAAGGCAT\n+GCATTAAAATAGAAAATCCAATCCAACAAATTAATAGATTAATTAAGTTTGAATATTGTC\n+CAAATTCTACTATTTCAATTGATACTGGTAATGTAATTAGAGCTCCTATGATAGTATTTA\n+TTAAAAATGGACCTACTGAAATAAAAAATACTTTTAATGGATTATCACTAGGTTCGTGCA\n+AAACATATCCACAAGGATTTGATAATTGGAAATATTTAACTTCATAGACTGGCACTCTCA\n+TTAGTCTGCAAAATATCTGATGTGCTAGTTCATGAACAATTACTCCAGGAAAGGTTACAA\n+TAGAAATTAAAAAACCAGGTATTATCATTAATT
TTGCCCTCCATTACTTAGGTAATAATC\n+ATTTATATTTATTTTTCCATTAAAATAATCTAATGTAGTATTATCAAATTTGTTTCCTGT\n+AGAAAGATATTCATCTTTGATTTTATTAACTTCTTCAGTT\n'
b
diff -r 000000000000 -r 68a3648c7d91 fosm_cluster/sim1_galaxy.fasta.stats
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fosm_cluster/sim1_galaxy.fasta.stats Thu Dec 22 04:45:31 2016 -0500
b
b'@@ -0,0 +1,19 @@\n+NODE_2_length_40000_cov_63.1617_ID_3\t0.659475\t63.1617\t0.00130009750731305\t0.00138760407030527\t0.00163762282171163\t0.00116258719403955\t0.00220016501237593\t0.00250018751406355\t0.00296272220416531\t0.00195014626096957\t0.00261269595219641\t0.00411280846063455\t0.00402530189764232\t0.00156261719628972\t0.000825061879640973\t0.00127509563217241\t0.00152511438357877\t0.000750056254219066\t0.00188764157311798\t0.00152511438357877\t0.00240018001350101\t0.00178763407255544\t0.00430032252418931\t0.00366277470810311\t0.00590044253318999\t0.00387529064679851\t0.00348776158211866\t0.0050628797159787\t0.00410030752306423\t0.00237517813836038\t0.000700052503937795\t0.00160012000900068\t0.00266269970247769\t0.00156261719628972\t0.00272520439032927\t0.00207515563667275\t0.00182513688526639\t0.00222516688751656\t0.00538790409280696\t0.00536290221766633\t0.00685051378853414\t0.00360027002025152\t0.00376278220866565\t0.00747556066705003\t0.00377528314623597\t0.00387529064679851\t0.0011250843813286\t0.00200015001125084\t0.00145010875815686\t0.00195014626096957\t0.000537540315523664\t0.00123759281946146\t0.00100007500562542\t0.000775058129359702\t0.0037252793959547\t0.00357526814511088\t0.00613796034702603\t0.00222516688751656\t0.00272520439032927\t0.00390029252193915\t0.00410030752306423\t0.00178763407255544\t0.000312523439257944\t0.00172512938470385\t0.00117508813160987\t0.00116258719403955\t0.000762557191789384\t0.00396279720979073\t0.00390029252193915\t0.00117508813160987\t0.00151261344600845\t0.00666299972497937\t0.00388779158436883\t0.00145010875815686\t0.00231267345050879\t0.0101132584943871\t0.00932569942745706\t0.00266269970247769\t0.000762557191789384\t0.00550041253093982\t0.00472535440158012\t0.00152511438357877\t0.00213766032452434\t0.00461284596344726\t0.00952571442858214\t0.00410030752306423\t0.00356276720754057\t0.00317523814286071\t0.00700052503937795\t0.00377528314623597\t0.00802560192014401\t0.0121259094432082\t0.0114758606895517\t0.0041003
0752306423\t0.000975073130484786\t0.00456284221316599\t0.00932569942745706\t0.00402530189764232\t0.00573793034477586\t0.00880066004950371\t0.00772557941845638\t0.00613796034702603\t0.00585043878290872\t0.0174763107233042\t0.0122009150686301\t0.00685051378853414\t0.00503787784083806\t0.0177888341625622\t0.00700052503937795\t0.00590044253318999\t0.00187514063554767\t0.00630047253544016\t0.00388779158436883\t0.00296272220416531\t0.000162512188414131\t0.00238767907593069\t0.000150011250843813\t0.00100007500562542\t0.00186263969797735\t0.00362527189539215\t0.00772557941845638\t0.00182513688526639\t0.00345025876940771\t0.00903817786333975\t0.00952571442858214\t0.00240018001350101\t0.000262519688976673\t0.00651298847413556\t0.00390029252193915\t0.00163762282171163\t0.00305022876715754\t0.00391279345950946\t0.00651298847413556\t0.00172512938470385\t0.00295022126659499\t0.00625046878515889\t0.00630047253544016\t0.00200015001125084\t0.00311273345500913\t0.00576293221991649\t0.00456284221316599\t0.00160012000900068\t0.00153761532114909\t0.00735055129134685\t0.00550041253093982\t0.00127509563217241\t0.00370027752081406\t0.00465034877615821\t0.00903817786333975\t0.00390029252193915\t0.00866314973623022\t0.00813811035827687\t0.0177888341625622\t0.00747556066705003\t0.00891316848763657\t0.0162762207165537\t0.0121259094432082\t0.0050628797159787\t0.00121259094432082\t0.00576293221991649\t0.0101132584943871\t0.00411280846063455\t0.00406280471035328\t0.00391279345950946\t0.00362527189539215\t0.00357526814511088\t0.00645048378628397\t0.0127759581968648\t0.0174763107233042\t0.00536290221766633\t0.00253769032677451\t0.00813811035827687\t0.00317523814286071\t0.00366277470810311\t0.00232517438807911\t0.00625046878515889\t0.00666299972497937\t0.00250018751406355\t0.000500037502812711\t0.00215016126209466\t0.00238767907593069\t0.00123759281946146\t0.00271270345275896\t0.00391279345950946\t0.00880066004950371\t0.00207515563667275\t0.00272520439032927\t0.00465034877615821\t0.00461284596344726
\t0.00152511438357877\t0.000350026251968898\t0.00391279345950946\t0.00396279720979073\t0.00138760407030527\t0.000375028127109533\t0.000350026251968898\t0.000262519688976673\t0.000312523439257944\t0.000'..b'538821234778\t0.00372583831362056\t0.00331324548023305\t0.001662874146683\t0.000225050636393188\t0.00150033757595459\t0.00110024755570003\t0.000650146282913656\t0.000462604085919332\t0.00425095646520467\t0.00316321172263759\t0.00110024755570003\t0.0011377559950989\t0.00795178915255933\t0.00371333550048761\t0.00121277287389663\t0.00210047260633643\t0.0114775824560526\t0.0112400290065265\t0.00277562451551599\t0.000512615338451152\t0.0050761421319797\t0.00357580455602511\t0.00126278412642845\t0.00205046135380461\t0.00523867870270811\t0.011915180915706\t0.00331324548023305\t0.00387587207121602\t0.00250056262659098\t0.00665149658673201\t0.00445100147533195\t0.00772673851616614\t0.0134530269310595\t0.0113275486984572\t0.00416343677327399\t0.00078767722737616\t0.00490110274811833\t0.0112400290065265\t0.00376334675301943\t0.00547623215223425\t0.00870195794053662\t0.00773924132929909\t0.00543872371283539\t0.00592633342502063\t0.0197919531894676\t0.0131029481633368\t0.007764246955565\t0.00416343677327399\t0.0193543547298142\t0.00665149658673201\t0.00650146282913656\t0.00183791353054437\t0.00618889250081268\t0.00371333550048761\t0.00332574829336601\t6.25140656647746e-05\t0.00278812732864895\t0.000325073141456828\t0.00071266034857843\t0.00147533194968868\t0.00338826235903078\t0.00773924132929909\t0.00178790227801255\t0.00362581580855693\t0.00942712110224801\t0.011915180915706\t0.00262559075792053\t0.000250056262659098\t0.00553874621789903\t0.00316321172263759\t0.00120027006076367\t0.00235052886899552\t0.00456352679352854\t0.00553874621789903\t0.00150033757595459\t0.00247555700032507\t0.00676402190492861\t0.00618889250081268\t0.00178790227801255\t0.0021629866720012\t0.0059888474906854\t0.00490110274811833\t0.00133780100522618\t0.000812682853642069\t0.00612637843514791\t0.00507
61421319797\t0.00118776724763072\t0.00395088895001375\t0.00493861118751719\t0.00942712110224801\t0.00372583831362056\t0.00973969143057188\t0.00847690730414343\t0.0193543547298142\t0.00816433697581956\t0.00887699732439799\t0.0191543097196869\t0.0134530269310595\t0.00510114775824561\t0.001387812257758\t0.0059888474906854\t0.0114775824560526\t0.00327573704083419\t0.00410092270760921\t0.00403840864194444\t0.00338826235903078\t0.00311320047010577\t0.00651396564226951\t0.0137781000725163\t0.0197919531894676\t0.00563876872296267\t0.00260058513165462\t0.00847690730414343\t0.00250056262659098\t0.00390087769748193\t0.00245055137405916\t0.00676402190492861\t0.00795178915255933\t0.00235052886899552\t0.000375084393988647\t0.00217548948513416\t0.00278812732864895\t0.000750168787977295\t0.00270060763671826\t0.00403840864194444\t0.00870195794053662\t0.0017753994648796\t0.00250056262659098\t0.00493861118751719\t0.00523867870270811\t0.00133780100522618\t0.000225050636393188\t0.00456352679352854\t0.00425095646520467\t0.00111275036883299\t0.000150033757595459\t0.000225050636393188\t0.000250056262659098\t0.000225050636393188\t0.00101272786376935\t0.00245055137405916\t0.00183791353054437\t0.00078767722737616\t0.000937710984971619\t0.001387812257758\t0.00078767722737616\t0.000775174414243205\t0.000225050636393188\t0.000812682853642069\t0.000512615338451152\t0.000550123777850016\t0.00163786852041709\t0.00250056262659098\t0.00362581580855693\t0.00172538821234778\t0.00420094521267285\t0.00260058513165462\t0.00416343677327399\t0.00367582706108875\t0.00702658098072066\t0.00887699732439799\t0.00772673851616614\t0.00372583831362056\t0.000937710984971619\t0.0021629866720012\t0.00210047260633643\t0.00207546698007052\t0.00236303168212848\t0.00270060763671826\t0.00147533194968868\t0.00295066389937736\t0.00385086644495011\t0.00651396564226951\t0.00592633342502063\t0.00575129404115926\t0.00420094521267285\t0.00973969143057188\t0.00387587207121602\t0.00470105773799105\t0.00101272786376935\t0.0024755570
0032507\t0.0011377559950989\t0.00233802605586257\t0.000175039383861369\t0.000375084393988647\t6.25140656647746e-05\t0.000237553449526143\t0.00236303168212848\t0.00410092270760921\t0.00547623215223425\t0.00201295291440574\t0.00163786852041709\t0.00395088895001375\t0.00205046135380461\t0.00133780100522618\t0.000150033757595459\t0.00235052886899552\t0.000462604085919332\t0.000462604085919332\tsim1_galaxy\n'
b
diff -r 000000000000 -r 68a3648c7d91 fosm_cluster/test
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fosm_cluster/test Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,20 @@
+"x"
+"NODE_2_length_40000_cov_63.1617_ID_3" 9
+"NODE_3_length_40000_cov_63.0619_ID_5" 6
+"NODE_10_length_39995_cov_63.156_ID_19" 1
+"NODE_12_length_39995_cov_63.3136_ID_23" 7
+"NODE_18_length_37282_cov_67.8523_ID_35" 8
+"NODE_5_length_39999_cov_63.277_ID_9" 5
+"NODE_16_length_39898_cov_63.337_ID_31" 4
+"NODE_1_length_40000_cov_62.8079_ID_1" 7
+"NODE_17_length_39819_cov_63.5061_ID_33" 6
+"NODE_9_length_39996_cov_63.0617_ID_17" 9
+"NODE_7_length_39998_cov_63.2738_ID_13" 8
+"NODE_6_length_39999_cov_63.2183_ID_11" 1
+"NODE_8_length_39997_cov_62.8364_ID_15" 3
+"NODE_19_length_33054_cov_76.4496_ID_37" 2
+"NODE_15_length_39993_cov_62.9495_ID_29" 5
+"NODE_4_length_39999_cov_63.2942_ID_7" 8
+"NODE_14_length_39993_cov_63.1132_ID_27" 5
+"NODE_11_length_39995_cov_63.1687_ID_21" 3
+"NODE_13_length_39994_cov_62.9723_ID_25" 9
b
diff -r 000000000000 -r 68a3648c7d91 mytrimmer/aaa.fplot
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mytrimmer/aaa.fplot Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,9 @@
+#-- forward hits sorted by %sim
+0 0 0
+0 0 0
+
+
+1 1 100
+2100 2100 100
+
+
b
diff -r 000000000000 -r 68a3648c7d91 mytrimmer/aaa.gp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mytrimmer/aaa.gp Thu Dec 22 04:45:31 2016 -0500
[
@@ -0,0 +1,21 @@
+set terminal png tiny size 800,800
+set output "aaa.png"
+set size 1,1
+set grid
+unset key
+set border 15
+set tics scale 0
+set xlabel "Assembly"
+set ylabel "Assembly_22"
+set format "%.0f"
+set mouse format "%.0f"
+set mouse mouseformat "[%.0f, %.0f]"
+set mouse clipboardformat "[%.0f, %.0f]"
+set xrange [1:2100]
+set yrange [1:2100]
+set style line 1  lt 1 lw 3 pt 6 ps 1
+set style line 2  lt 3 lw 3 pt 6 ps 1
+set style line 3  lt 2 lw 3 pt 6 ps 1
+plot \
+ "aaa.fplot" title "FWD" w lp ls 1, \
+ "aaa.rplot" title "REV" w lp ls 2
b
diff -r 000000000000 -r 68a3648c7d91 mytrimmer/aaa.png
b
Binary file mytrimmer/aaa.png has changed
b
diff -r 000000000000 -r 68a3648c7d91 mytrimmer/aaa.rplot
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mytrimmer/aaa.rplot Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,5 @@
+#-- reverse hits sorted by %sim
+0 0 0
+0 0 0
+
+
b
diff -r 000000000000 -r 68a3648c7d91 mytrimmer/f1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mytrimmer/f1 Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,31 @@
+>Assembly
+AGAATTCGTCTTGCTCTATTCACCCTTACTTTTCTTCTTGCCCGTTCTCTTTCTTAGTATGAATCCAGTA
+TGCCTGCCTGTAATTGTTGCGCCCTACCTCTTTTGGCTGGCGGCTATTGCCGCCTCGTGTTTCACGGCCT
+CAGTTAGTACCGTTGTGACCGCCACCGGCTTGGCCCTCTCACTTCTACTCTTGGCAGCAGTGGCCAGCTC
+ATATGCCGCTGCACAAAGGAAACTGCTGACACCGGTGACAGTGCTTACTGCGGTTGTCACTTGTGAGTAC
+ACACGCACCATTTACAATGCATGATGTTCGTGAGATTGATCTGTCTCTAACAGTTCACTTCCTCTGCTTT
+TCTCCTCAGTCTTTGCAATTTGCCTAACATGGAGGATTGAGGACCCACCTTTTAATTCTCTTCTGTTTGC
+ATTGCTGGCCGCAGCTGGCGGACTACAAGGCATTTACGGTTAGTGTGCCTCTGTTATGAAATGCAGGTTT
+GACTTCATATGTATGCCTTGGCATGACGTCAACTTTACTTTTATTTCAGTTCTGGTGATGCTTGTGCTCC
+TGATACTAGCGTACAGAAGGAGATGGCGCCGTTTGACTGTTTGTGGCGGCATCATGTTTTTGGCATGTGT
+ACTTGTCCTCATCGTCGACGCTGTTTTGCAGCTGAGTCCCCTCCTTGGAGCTGTAACTGTGGTTTCCATG
+ACGCTGCTGCTACTGGCTTTCGTCCTCTGGCTCTCTTCGCCAGGGGGCCTAGGTACTCTTGGTGCAGCCC
+TTTTAACATTGGCAGCAGGTAAGCCACACGTGTGACATTGCTTGCCTTTTTGCCACATGTTTTCTGGACA
+CAGGACTAACCATGCCATCTCTGATTATAGCTCTGGCACTGCTAGCGTCACTGATTTTGGGCACACTTAA
+CTTGACTACAATGTTCCTTCTCATGCTCCTATGGACACTTGGTAAGTTTTCCCTTCCTTTAACTCATTAC
+TTGTTCTTTTGTAATCGCAGCTCTAACTTGGCATCTCTTTTACAGTGGTTCTCCTGATTTGCTCTTCGTG
+CTCTTCATGTCCACTGAGCAAGATCCTTCTGGCACGACTGTTCCTATATGCTCTCGCACTCTTGTTGCTA
+GCCTCCGCGCTAATCGCTGGTGGCAGTATTTTGCAAACAAACTTCAAGAGTTTAAGCAGCACTGAATTTA
+TACCCAGTGAGTATCTATTTGTTACTCCTGTTTAGTTGAAGAAAACAAGCTATTGGATTGTAACACACAT
+TTTACGCTTTGTTCCTTAGATTTGTTCTGCATGTTATTACTGATTGTCGCTGGCATACTCTTCATTCTTG
+CTATCCTGACCGAATGGGGCAGTGGAAATAGAACATACGGTCCAGTTTTTATGTGCCTCGGTGGCCTGCT
+CACCATGGTAGCCGGCGCTGTGTGGCTGACGGTGATGTCTAACACGCTTTTGTCTGCCTGGATTCTTACA
+GCAGGATTCCTGATTTTCCTCATTGGTAAGTGTGACACCAACAGGTGTTGCCTTGTTATGTCACCGTTCT
+GACACATGACTTACATGGGTTTGGCTTTTGTAGGCTTTGCCCTCTTTGGGGTCATTAGATGCTGCCGCTA
+CTGCTGCTACTACTGCCTTACACTGGAAAGTGAGGAGCGCCCACCGACCCCATATCGCAACACTGTATAA
+AGGTAAGTATTATTAAATTTTAGAGACACTATCACGTGTAACTTGACGTGCAAGGATGGAAGAGAGGGGC
+AGGGAAACGCAAATGCCGGTTGCCCGGTATGGGGGCCCGTTTATTATGGTAAGGCTCTTCGGGCAAGATG
+GAGAGGCAAACATACAGGAGGAAAGGCTATATGAGCTACTCTCTGACCCACGCTCCGCGCTCGGCCTAGA
+CCCGGGGCCCCTGATTGCTGAGAACCTGCTGCTAGTGGCGCTGCGTGGCACCAACAACGATCCCAGGCCT
+CAGCGTCAGGAGAGGGCCAGAGAACTGGCCCTCGTTGGCATTCTACTAGGAAACGGCGAGCAGGGTGAAC
+ACTTGGGCACGGAGAGTGCCCTGGAGGCCTCAGGCAACAACTATGTGTATGCCTACGGACCAGACTGGAT
b
diff -r 000000000000 -r 68a3648c7d91 mytrimmer/f2
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mytrimmer/f2 Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,31 @@
+>Assembly_22
+AGAATTCGTCTTGCTCTATTCACCCTTACTTTTCTTCTTGCCCGTTCTCTTTCTTAGTATGAATCCAGTA
+TGCCTGCCTGTAATTGTTGCGCCCTACCTCTTTTGGCTGGCGGCTATTGCCGCCTCGTGTTTCACGGCCT
+CAGTTAGTACCGTTGTGACCGCCACCGGCTTGGCCCTCTCACTTCTACTCTTGGCAGCAGTGGCCAGCTC
+ATATGCCGCTGCACAAAGGAAACTGCTGACACCGGTGACAGTGCTTACTGCGGTTGTCACTTGTGAGTAC
+ACACGCACCATTTACAATGCATGATGTTCGTGAGATTGATCTGTCTCTAACAGTTCACTTCCTCTGCTTT
+TCTCCTCAGTCTTTGCAATTTGCCTAACATGGAGGATTGAGGACCCACCTTTTAATTCTCTTCTGTTTGC
+ATTGCTGGCCGCAGCTGGCGGACTACAAGGCATTTACGGTTAGTGTGCCTCTGTTATGAAATGCAGGTTT
+GACTTCATATGTATGCCTTGGCATGACGTCAACTTTACTTTTATTTCAGTTCTGGTGATGCTTGTGCTCC
+TGATACTAGCGTACAGAAGGAGATGGCGCCGTTTGACTGTTTGTGGCGGCATCATGTTTTTGGCATGTGT
+ACTTGTCCTCATCGTCGACGCTGTTTTGCAGCTGAGTCCCCTCCTTGGAGCTGTAACTGTGGTTTCCATG
+ACGCTGCTGCTACTGGCTTTCGTCCTCTGGCTCTCTTCGCCAGGGGGCCTAGGTACTCTTGGTGCAGCCC
+TTTTAACATTGGCAGCAGGTAAGCCACACGTGTGACATTGCTTGCCTTTTTGCCACATGTTTTCTGGACA
+CAGGACTAACCATGCCATCTCTGATTATAGCTCTGGCACTGCTAGCGTCACTGATTTTGGGCACACTTAA
+CTTGACTACAATGTTCCTTCTCATGCTCCTATGGACACTTGGTAAGTTTTCCCTTCCTTTAACTCATTAC
+TTGTTCTTTTGTAATCGCAGCTCTAACTTGGCATCTCTTTTACAGTGGTTCTCCTGATTTGCTCTTCGTG
+CTCTTCATGTCCACTGAGCAAGATCCTTCTGGCACGACTGTTCCTATATGCTCTCGCACTCTTGTTGCTA
+GCCTCCGCGCTAATCGCTGGTGGCAGTATTTTGCAAACAAACTTCAAGAGTTTAAGCAGCACTGAATTTA
+TACCCAGTGAGTATCTATTTGTTACTCCTGTTTAGTTGAAGAAAACAAGCTATTGGATTGTAACACACAT
+TTTACGCTTTGTTCCTTAGATTTGTTCTGCATGTTATTACTGATTGTCGCTGGCATACTCTTCATTCTTG
+CTATCCTGACCGAATGGGGCAGTGGAAATAGAACATACGGTCCAGTTTTTATGTGCCTCGGTGGCCTGCT
+CACCATGGTAGCCGGCGCTGTGTGGCTGACGGTGATGTCTAACACGCTTTTGTCTGCCTGGATTCTTACA
+GCAGGATTCCTGATTTTCCTCATTGGTAAGTGTGACACCAACAGGTGTTGCCTTGTTATGTCACCGTTCT
+GACACATGACTTACATGGGTTTGGCTTTTGTAGGCTTTGCCCTCTTTGGGGTCATTAGATGCTGCCGCTA
+CTGCTGCTACTACTGCCTTACACTGGAAAGTGAGGAGCGCCCACCGACCCCATATCGCAACACTGTATAA
+AGGTAAGTATTATTAAATTTTAGAGACACTATCACGTGTAACTTGACGTGCAAGGATGGAAGAGAGGGGC
+AGGGAAACGCAAATGCCGGTTGCCCGGTATGGGGGCCCGTTTATTATGGTAAGGCTCTTCGGGCAAGATG
+GAGAGGCAAACATACAGGAGGAAAGGCTATATGAGCTACTCTCTGACCCACGCTCCGCGCTCGGCCTAGA
+CCCGGGGCCCCTGATTGCTGAGAACCTGCTGCTAGTGGCGCTGCGTGGCACCAACAACGATCCCAGGCCT
+CAGCGTCAGGAGAGGGCCAGAGAACTGGCCCTCGTTGGCATTCTACTAGGAAACGGCGAGCAGGGTGAAC
+ACTTGGGCACGGAGAGTGCCCTGGAGGCCTCAGGCAACAACTATGTGTATGCCTACGGACCAGACTGGAT
b
diff -r 000000000000 -r 68a3648c7d91 mytrimmer/out.fplot
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mytrimmer/out.fplot Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,9 @@
+#-- forward hits sorted by %sim
+0 0 0
+0 0 0
+
+
+1 1 100
+2100 2100 100
+
+
b
diff -r 000000000000 -r 68a3648c7d91 mytrimmer/out.gp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mytrimmer/out.gp Thu Dec 22 04:45:31 2016 -0500
[
@@ -0,0 +1,21 @@
+set terminal png tiny size 800,800
+set output "out.png"
+set size 1,1
+set grid
+unset key
+set border 15
+set tics scale 0
+set xlabel "Assembly"
+set ylabel "Assembly_22"
+set format "%.0f"
+set mouse format "%.0f"
+set mouse mouseformat "[%.0f, %.0f]"
+set mouse clipboardformat "[%.0f, %.0f]"
+set xrange [1:2100]
+set yrange [1:2100]
+set style line 1  lt 1 lw 3 pt 6 ps 1
+set style line 2  lt 3 lw 3 pt 6 ps 1
+set style line 3  lt 2 lw 3 pt 6 ps 1
+plot \
+ "out.fplot" title "FWD" w lp ls 1, \
+ "out.rplot" title "REV" w lp ls 2
b
diff -r 000000000000 -r 68a3648c7d91 mytrimmer/out.png
b
Binary file mytrimmer/out.png has changed
b
diff -r 000000000000 -r 68a3648c7d91 mytrimmer/out.rplot
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mytrimmer/out.rplot Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,5 @@
+#-- reverse hits sorted by %sim
+0 0 0
+0 0 0
+
+
b
diff -r 000000000000 -r 68a3648c7d91 mytrimmer/test.delta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mytrimmer/test.delta Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,5 @@
+/home/inmare/galaxy/tools/mytrimmer/f1 /home/inmare/galaxy/tools/mytrimmer/f2
+NUCMER
+>Assembly Assembly_22 2100 2100
+1 2100 1 2100 0 0 0
+0
b
diff -r 000000000000 -r 68a3648c7d91 mytrimmer/trim.seqs.C.cpp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mytrimmer/trim.seqs.C.cpp Thu Dec 22 04:45:31 2016 -0500
[
@@ -0,0 +1,208 @@
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+#include <vector>
+#include <map>
+#include <math.h>
+#include <string>
+
+using namespace std;
+int main (int argc, char *argv[]);
+int eval_quality(string & qstring,int lencutoff,int errors);
+
+int main (int argc, char *argv[])
+{
+ if (argc==9)
+ {
+ unsigned long inseq=0;
+ unsigned long outseq=0;
+ unsigned long pfile=0; 
+ ifstream infile;
+ ifstream infileP;
+ string file=argv[1];
+ string filep=argv[2];
+ ofstream outfile;
+ ofstream outfilep;
+ ofstream outfileunm;
+ string outname=(argv[6]);
+ string outnamep=(argv[7]);
+ string outunm=(argv[8]);
+ outfile.open(outname.c_str());
+ outfilep.open(outnamep.c_str());
+ outfileunm.open(outunm.c_str());
+ int cutoff=atoi(argv[3]);
+ int errors=atoi(argv[4]);
+ int discard=atoi(argv[5]);
+ infile.open(file.c_str());
+ infileP.open(filep.c_str());
+ if (!infile)
+ {
+ cerr << "Couldn't open "<< infile << "\n"; 
+ exit(1);
+ }
+ if (!infileP)
+ {
+ cerr << "Couldn't open "<< outfile << "\n"; 
+ exit(1);
+ }
+ map <int,int> Min;
+ map <int,int> Max;
+ if (infile.is_open() && infileP.is_open()){
+ string header;
+ string seq;
+ string seqp;
+ string qscore;
+ string qscorep;
+ while (!infile.eof() && !infileP.eof())
+ {
+ getline(infile,header);
+ if (header!="")
+ {
+ //read headers + sequences
+ getline(infile,seq); //
+ getline(infileP,seqp);
+ getline(infileP,seqp);//
+ //
+ //cout <<"A:" << seq << "\n" << "B:" << seqp << "\n";
+ inseq+=seq.length();
+ inseq+=seqp.length();
+
+
+ //read Qscores
+ getline(infile,qscore);
+ getline(infile,qscore);//
+ getline(infileP,qscorep);
+ getline(infileP,qscorep);//
+ if (discard >0 && discard<=seq.length())
+ {
+ seq=seq.substr(discard-1,seq.length()-discard);
+ seqp=seqp.substr(discard-1,seqp.length()-discard);
+ qscore=qscore.substr(discard-1,qscore.length()-discard);
+ qscorep=qscorep.substr(discard-1,qscorep.length()-discard);
+ }
+ if (qscore.length()!=seq.length())
+ {
+ cerr << "Invalid fastq\n" << seq << "\n" << qscore  << "\n";
+ exit(1);
+ }
+
+ if (qscorep.length()!=seqp.length())
+                                {
+                                        cerr << "Invalid fastq\n" << seqp << "\n" << qscorep  << "\n";
+ exit(1);
+                                }
+
+
+ //
+ //cout << qscore << "\n" << qscorep << "\n";
+
+ //eval Qscores
+ int p=eval_quality(qscore,cutoff,errors);
+ int pp=eval_quality(qscorep,cutoff,errors);
+
+ string Qheader=header;
+                                if (*(Qheader.end()-2)=='/') // togli gli slash
+                                {
+                                 Qheader.replace(Qheader.end()-2,Qheader.end(),"");
+                                }
+                      string Oheader=Qheader;
+                      Oheader[0]='+';                   
+ if (p>0)
+ {
+ seq=seq.substr(0,p);
+ qscore=qscore.substr(0,p);
+ if (pp>0)
+ {
+ seqp=seqp.substr(0,pp);
+ qscorep=qscorep.substr(0,pp);
+ outseq+=seqp.length();
+ outseq+=seq.length();
+ outfile << Qheader <<"/1" << "\n" << seq << "\n" << Oheader <<"/1" << "\n" << qscore << "\n";
+ outfilep << Qheader <<"/2" << "\n" << seqp << "\n" << Oheader<<"/2"  << "\n" << qscorep << "\n";
+ }else{
+ outseq+=seq.length();
+ outfileunm << Qheader <<"/1"  << "\n" << seq << "\n" << Oheader<<"/1"  << "\n" << qscore << "\n";
+ }
+ }else if(p==0 && pp>0){
+ seqp=seqp.substr(0,pp);
+ qscorep=qscorep.substr(0,pp);
+ outseq+=seqp.length();
+ outfileunm << Qheader <<"/2"  << "\n" << seqp << "\n" << Oheader <<"/2"  << "\n" << qscorep << "\n";
+ }
+ }
+ }
+
+ }else{
+ cerr << "could not open files\n";
+ }
+ //cerr << "Input "<< inseq << " bases.\nOutput " << outseq << " bases.\n";
+ }else{
+
+ cout << "input: <first_file> <second_file> <len_cutoff> <number of errors> <low qual base> <ofile1 <ofile2> <ofile3>\n";
+ }
+}
+
+int eval_quality(string & qstring,int lencutoff,int errors)
+{
+ int Nminori10=0;
+ int Nminori20=0;
+ int Nmaggiori25=0;
+ int l10=0;
+ int l20=0;
+ int p=0;
+ double total_perr=0;
+ string::iterator pos;
+ for (pos=qstring.begin();pos!=qstring.end();pos++)
+ {
+ int punteggio=static_cast<int> (*pos)-33;
+ if (punteggio>=1 && punteggio <=41)
+ {
+ double exp=(double)punteggio/-10;
+ total_perr+=pow(10,exp);
+ if (p>0)
+                 {
+                 if (punteggio<=10) //count qscores <=10
+                         {
+                          l10++;
+                                 Nminori20++;
+                                 Nminori10++;
+                         }else if (punteggio<=20){ // count Qscores <=20
+                         l20++;
+                         Nminori10=0;
+                         Nminori20++;
+                         }else if (punteggio>20){
+                         Nminori20=0;
+                         Nminori10=0;
+ if (punteggio>=25)
+ {
+ Nmaggiori25++;
+ }
+                         }
+ }
+ if (Nminori10>=10) // 3 or more consecutives very low quality bases
+                 {
+                 p-=Nminori10;
+                        break;
+                 }else if (Nminori20>=15){ // 5 or more consecutives low quality bases 
+                 p-=Nminori20;
+                         break;
+ }
+                 if (total_perr>=(double)errors) // sum of per base error probability when 5e-2 5 wrong base calls in 100 
+                 {
+ //cout << p << " " << total_perr << " " << errors << "\n";
+                 break;
+                 }
+ p++;
+ }else{
+ cerr << "Invalid Qscore" << *pos << "=" << punteggio << "\n";
+ exit(1);
+ }
+ }
+ double prop_gr_25=(double)Nmaggiori25/(double)(p);
+        if (prop_gr_25>=0.35 && p>=lencutoff && l20 <= p*0.2 && l10 <= p*0.1) // if 50% of Qscores are >= 25,size is >= cutoff a 
+ {
+ return p;
+ }else{
+ return 0;
+ }
+}
b
diff -r 000000000000 -r 68a3648c7d91 mytrimmer/trimPE
b
Binary file mytrimmer/trimPE has changed
b
diff -r 000000000000 -r 68a3648c7d91 mytrimmer/trimPE.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mytrimmer/trimPE.xml Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,27 @@
+<tool id="trimmer" name="Custom quality trimmer" version="0.">
+ <description>trimming tool for Illumina PE data</description>
+ <command> /home/inmare/galaxy/tools/mytrimmer/trimPE $f1 $f2 $l $ne $disc  $o1 $o2 $o3</command>
+ <description> "approved by the boss" </description>
+ <inputs>
+  <param name="f1" type="data" format="fastq" label="R1 reads file" help="fastq only"/>
+  <param name="f2" type="data" format="fastq" label="R2 reads file" help="fastq only"/>
+  <param name="l" type="integer" label="minimum read length" value="100"  help="don't go too short!"/>
+  <param name="ne" type="integer" label="max number of errors" value="10"  help="usually between 2 and 10"/>
+  <param name="disc" type="integer" label="cut xx bp from the beginning of each read" value="15"  help="have a look to the fastqc report"/>
+ </inputs>
+ <outputs>
+  <data name="o1" ftype="fastq" format="fastq" label="r1 trimmed reads"/>
+  <data name="o2" ftype="fastq" format="fastq" label="r2 trimmed reads"/>
+  <data name="o3" ftype="fastq" format="fastq" label="unmated reads"/>
+ </outputs>
+ <test/>
+ <help>
+ Our custom script for quality trimming implements strict quality filters based on the provided base call quality scores. Reads are iteratively trimmed from the 3' end until all of the following conditions are satisfied:
+1. the median quality score (Qscore) of upstream bases is ≥15
+2. fewer than 10 bases with Qscore ≤10 and fewer than 15 bases with Qscore ≤20 are present in the upstream sequence
+3. the cumulative error probability in the upstream region is below a user defined cutoff
+4. the length of the trimmed read exceeds a user defined cutoff
+
+The program is designed to work with paired-end sequencing files only. The output consists of 3 distinct files: two files containing the pairs where both mates passed the filters, and a third file containing all the singleton reads, for which the corresponding mate did not pass the quality filters.
+ </help>
+</tool>
b
diff -r 000000000000 -r 68a3648c7d91 pfamScan/Bio/Pfam/Active_site/as_search.pm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfamScan/Bio/Pfam/Active_site/as_search.pm Thu Dec 22 04:45:31 2016 -0500
[
b'@@ -0,0 +1,399 @@\n+package Bio::Pfam::Active_site::as_search;\n+\n+use strict;\n+use warnings;\n+\n+use Bio::SeqFeature::Generic;\n+use Bio::SimpleAlign;\n+use Bio::Pfam::Scan::Seq;\n+\n+=head2 find_as\n+\n+ Title   : find_as\n+ Usage   : find_as($as_aln, $as_res, $seq_id, $seq_se, $seq_region, $family, $hmm_file)\n+ Function: finds active sites in a query sequence which \n+           has a match to a Pfam active site family\n+\n+ Returns : An array reference of active site postions\n+ Args    : Alignment object of active site sequences, hash of arrays containing seq ids => active site positions, \n+           start-end sequence in the format "3-50", sequence region, family,  file containing all Pfam models\n+\n+=cut\n+\n+sub find_as {\n+  my ($as_aln, $as_res, $seq_id, $seq_se, $seq_region, $family, $hmm_file) = @_;\n+\n+\n+  system("hmmfetch $hmm_file $family > /tmp/hmm.$$") and die "FATAL: Problem running [hmmfetch $hmm_file $family > /tmp/hmm.$$]\\n";\n+\n+  $seq_id = "Query_".$seq_id; \n+\n+  my $fasta;\n+  foreach my $seq ($as_aln->each_seq) {\n+      my $s = $seq->seq;\n+      $s =~ s/[\\-\\.]//g; #Remove gaps\n+\n+      $fasta .= ">" . $seq->id . "/" . $seq->start . "-" . $seq->end . 
"\\n$s\\n";\n+  }\n+  $fasta .= ">$seq_id/$seq_se\\n$seq_region";\n+  open(SEQ, ">/tmp/seqs.$$") or die "Couldn\'t open file seqs.$$ $!\\n";\n+  print SEQ $fasta;\n+  close SEQ;\n+ \n+\n+  open(OUT, "hmmalign --outformat Pfam /tmp/hmm.$$ /tmp/seqs.$$ |") or die "Couldn\'t open fh to hmmalign $!\\n";\n+\n+  my $aln = new Bio::SimpleAlign;\n+  my ($name, $start, $end, $seq);\n+  while(<OUT>) {\n+    if( /^(\\S+)\\/(\\d+)-(\\d+)\\s+(\\S+)\\s*/ ) {\n+\t$name = $1;\n+\t$start = $2;\n+\t$end = $3;\n+\t$seq = $4;\n+\t    \n+\t$aln->add_seq(Bio::Pfam::Scan::Seq->new(\'-seq\'=>$seq, \'-id\'=>$name, \'-start\'=>$start, \'-end\'=>$end, \'-type\'=>\'aligned\'));\n+    }\n+  }\n+  close OUT;\n+  \n+\n+  unlink "/tmp/seqs.$$";\n+  unlink "/tmp/hmm.$$";\n+  \n+  #Locate exp as in fam\n+  _exp_as($aln, $as_res);\n+\n+  #Store as patterns\n+  my $pattern_aln = new Bio::SimpleAlign;\n+  _pattern_info($aln, $pattern_aln);\n+  #find pred as\n+  my $array_ref = _add_pred_as($aln, $pattern_aln, $seq_id);\n+  return $array_ref;\n+}\n+\n+=head2 _exp_as\n+\n+ Title    : _exp_as\n+ Usage    : _exp_as($aln,  $hash_of_arrays)\n+ Function : Adds experimental active site data to alignment object\n+ Returns  : Nothing, populates the alignment object with active site residue info\n+ Args     : alignment object\n+\n+=cut\n+\n+sub _exp_as {\n+ \n+  my ($aln, $as_res) = @_;\n+\n+\n+  foreach my $seq ($aln->each_seq) {\n+\n+      foreach my $pos ( @{$as_res->{$seq->id}}) {\n+\n+        if($pos >= $seq->start and $pos <= $seq->end) { #Feature is in the alignment\n+                  \n+             #store column position for seq\n+             my $col = $aln->column_from_residue_number($seq->id, $pos);              \n+\n+             #add feature to seq\n+             my $aa .= uc substr($seq->seq(), $col-1, 1); \n+\n+             my $feat = new Bio::SeqFeature::Generic  (  -display_name => \'experimental\',\n+                                                         -primary => $aa,\n+\t\t\t\t\t\t\t 
-start => $col);\n+\n+\n+\n+\t     $seq->add_SeqFeature($feat);\n+\t }\n+\n+    }\n+  }\n+}\n+\n+\n+=head2 _pattern_info\n+\n+ Title    : _pattern_info\n+ Usage    : _pattern_info($aln_object, $aln_object)\n+ Function : Takes an alignment and extracts active site patterns into a second alignment\n+ Returns  : Nothing, populates a second alignment object with active site seqences\n+ Args     : alignment object, empty alignment object\n+\n+=cut\n+\n+\n+sub _pattern_info {\n+    my ($aln, $pattern_aln) = @_;\n+    my (%pat_col_seq);\n+  \n+    foreach my $seq ( $aln->each_seq() ) {  \n+\n+\tnext unless($seq->all_SeqFeatures());\n+           my ($pat, $col);\n+           foreach my $feat ( sort {$a->start <=> $b->start }  $seq->all_SeqFeatures() ) {            \n+              $pat .= $feat->primary_tag();   #HEK\n+              $col .= $feat->start() . " ";    #33 44 55\n+\t   }\n+\n+           unless(exists($pat_col_seq{"$pat:$col"})) {\n+\t       $pattern_aln->add_se'..b'h_seq() ) {\n+                   next if($sequence1 eq $sequence2);\n+\n+                   my ($seq1_st, $seq1_en, $seq2_st, $seq2_en);\n+\n+                   my (%hash1, %hash2, $num_1, $num_2, %smaller, %larger);\n+\n+                   #see if patterns overlap - find pattern start ends and collect column positions\n+                   foreach my $feat1 ($sequence1->all_SeqFeatures() ) {\n+\n+                       $seq1_st = $feat1->start() if(!$seq1_st or $feat1->start() < $seq1_st);\n+                       $seq1_en = $feat1->start() if(!$seq1_en or $feat1->start() > $seq1_en);\n+                   }\n+\n+                   foreach my $feat2 ($sequence2->all_SeqFeatures() ) {\n+\n+                       $seq2_st = $feat2->start() if(!$seq2_st or $feat2->start() < $seq2_st);\n+                       $seq2_en = $feat2->start() if(!$seq2_en or $feat2->start() > $seq2_en);\n+                   }\n+\n+                   #then see if patterns overlap - remove sequence with pattern of least 
identity\n+                   if(($seq1_st >= $seq2_st and $seq1_st <= $seq2_en) or ($seq2_st >= $seq1_st and $seq2_st <= $seq1_en)) {\n+                       my $remove = _identity($query_seq, $sequence1, $sequence2);\n+                       $seq_to_remove{$remove}= $remove;\n+                   }\n+             }\n+\n+           }\n+         }\n+\n+         #Now remove any patterns which need removing\n+         foreach my $remove (keys %seq_to_remove) {\n+           $aligns_with->remove_seq($seq_to_remove{$remove});\n+           $num_seq = $aligns_with->num_sequences();\n+           last if($num_seq eq "1"); #just in case the % identities are identical\n+         }\n+\n+\n+         $num_seq = $aligns_with->num_sequences();\n+         unless($num_seq >=1) {\n+            die "FATAL: All sequences that align with active site sequences have been removed - this should never happen\\n";\n+         }\n+\n+\n+\n+           #Add features to seq\n+           foreach my $sequence ($aligns_with->each_seq() ) {\n+                foreach my $feat ($sequence->all_SeqFeatures() ) {\n+\n+                   my $actual_pos = $query_seq->location_from_column($feat->start);\n+                   $actual_pos = $actual_pos->start();\n+\n+\n+                   push(@as_res, $actual_pos);\n+\n+ \n+\n+               }\n+           }\n+           return \\@as_res\n+\n+}\n+\n+\n+=head2 _identity\n+\n+ Title    : _identity\n+ Usage    : _identity($sequence1 , $sequence2, $sequence3)\n+ Function : Identifies seq with lowest % identity to sequence1\n+ Returns  : The sequence which has the lowest % id to sequence 1\n+ Args     : sequence1, sequence2, sequence3.\n+\n+=cut\n+\n+\n+sub _identity {\n+    my $seq1 = shift;\n+    my @aligns_with = @_;\n+          my $lower_identity=100;\n+          my $lower_identity_seq;\n+          foreach my $s (@aligns_with) {\n+             my $tmp_aln = new Bio::SimpleAlign;\n+             $tmp_aln->add_seq($s);\n+             $tmp_aln->add_seq($seq1);\n+\n+ 
            my $identity = $tmp_aln->percentage_identity();\n+             if($identity < $lower_identity) {\n+                 $lower_identity = $identity;\n+                 $lower_identity_seq = $s;\n+             }\n+\n+          }\n+          return $lower_identity_seq;\n+}\n+\n+=head1 COPYRIGHT\n+\n+Copyright (c) 2007: Genome Research Ltd.\n+\n+Authors: Rob Finn (rdf@sanger.ac.uk), John Tate (jt6@sanger.ac.uk),\n+         Jaina Mistry (jm14@sanger.ac.uk)\n+\n+This is free software; you can redistribute it and/or modify it under\n+the terms of the GNU General Public License as published by the Free Software\n+Foundation; either version 2 of the License, or (at your option) any later\n+version.\n+\n+This program is distributed in the hope that it will be useful, but WITHOUT\n+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS\n+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more\n+details.\n+\n+You should have received a copy of the GNU General Public License along with\n+this program. If not, see <http://www.gnu.org/licenses/>.\n+\n+=cut\n+\n+1;\n'
b
diff -r 000000000000 -r 68a3648c7d91 pfamScan/Bio/Pfam/HMM/HMM.pm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfamScan/Bio/Pfam/HMM/HMM.pm Thu Dec 22 04:45:31 2016 -0500
[
@@ -0,0 +1,266 @@
+# HMM.pm
+#
+# Author:        finnr
+# Maintainer:    $Id: HMM.pm,v 1.1 2009-10-08 12:27:28 jt6 Exp $
+# Version:       $Revision: 1.1 $
+# Created:       Nov 24, 2008
+# Last Modified: $Date: 2009-10-08 12:27:28 $
+=head1 NAME
+
+Bio::Pfam::HMM::HMM - Moose data class describing a single HMMER3 profile HMM
+
+=cut
+
+package Bio::Pfam::HMM::HMM;
+
+=head1 DESCRIPTION
+
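+Holds the header fields of one HMMER3 model (version, name, accession, description, length, alphabet, flags, date, build line, statistics and thresholds) together with its COMP lines, emission lines and map positions, each stored as a type-checked Moose attribute.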
+
+$Id: HMM.pm,v 1.1 2009-10-08 12:27:28 jt6 Exp $
+
+=head1 COPYRIGHT
+
+File: HMM.pm
+
+Copyright (c) 2007: Genome Research Ltd.
+
+Authors: Rob Finn (rdf@sanger.ac.uk), John Tate (jt6@sanger.ac.uk)
+
+ This is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License
+ as published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.

+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.

+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ or see the on-line version at http://www.gnu.org/copyleft/gpl.txt

+=cut
+
+use strict;
+use warnings;
+
+use Moose;
+use Moose::Util::TypeConstraints;
+use Carp;
+
+#-------------------------------------------------------------------------------
+
+=head1 METHODS
+
+=cut
+
+
+# Type constraints used to validate the attributes of this class.
+
+# True when the stats hash has a defined lambda between 0.5 and 0.8
+# (both bounds inclusive).  Shared by the three *Stats constraints.
+my $lambda_in_range = sub {
+  my ($stats) = @_;
+  return defined( $stats->{lambda} )
+      && $stats->{lambda} >= 0.5
+      && $stats->{lambda} <= 0.8;
+};
+
+# First line of a HMMER3 model file.
+# NOTE(review): the '|' is unescaped, so the pattern is an alternation and any
+# string containing whitespace followed by a bracketed group also passes.
+# Left unchanged because tightening it could reject header lines that are
+# accepted today -- confirm the intended pattern before changing it.
+subtype 'hmmVersion'
+  => as Str
+  => where { $_ =~ m/^HMMER3\/f\s+\[3\.\d+[ab](\d+)?\s+|\s+\[.*\]/ }
+  => message { "|$_| does not look like a HMMER3 version" };
+
+# Model name.
+# NOTE(review): the pattern is unanchored, so it only requires one
+# non-whitespace character; the 15-character limit is not enforced.
+subtype 'hmmName'
+  => as Str
+  => where { $_ =~ m/\S{1,15}/ }
+  => message { "|$_| does not look like Pfam name or SEED" };
+
+# Accession: must contain 'PF' followed by five digits (unanchored).
+subtype 'hmmAcc'
+  => as Str
+  => where { $_ =~ m/PF\d{5}/ }
+  => message { "|$_| does not look like Pfam accession" };
+
+# Alphabet: exactly 'amino' or 'nucleic'.
+subtype 'hmmAlpha'
+  => as Str
+  => where { $_ eq 'amino' or $_ eq 'nucleic'  }
+  => message { "|$_| does not look like a HMMER3 alphabet" };
+
+# MSV statistics: hash with a defined mu and a lambda in range.
+subtype 'hmmMsvStats'
+  => as HashRef
+  => where { defined ($_->{mu}) and $lambda_in_range->($_) }
+  => message { "Mu |$_->{mu}| and lambda |$_->{lambda}| must be defined and lambda must be between 0.5 and 0.8" };
+
+# Viterbi statistics: hash with a defined mu and a lambda in range.
+subtype 'hmmViterbiStats'
+  => as HashRef
+  => where { defined ($_->{mu}) and $lambda_in_range->($_) }
+  => message { "Mu |$_->{mu}| and lambda |$_->{lambda}| must be defined and lambda must be between 0.5 and 0.8" };
+
+# Forward statistics: hash with a defined tau and a lambda in range.
+subtype 'hmmForwardStats'
+  => as HashRef
+  => where { defined ($_->{tau}) and $lambda_in_range->($_) }
+  => message { "Tau |$_->{tau}| and lambda |$_->{lambda}| must be defined and lambda must be between 0.5 and 0.8" };
+  
+# --- Header fields -----------------------------------------------------------
+# All attributes are read/write; the ones marked required must be supplied to
+# the constructor.
+has 'version'     => ( is => 'rw', isa => 'hmmVersion', required => 1 );
+has 'name'        => ( is => 'rw', isa => 'hmmName',    required => 1 );
+has 'accession'   => ( is => 'rw', isa => 'hmmAcc' );
+has 'description' => ( is => 'rw', isa => 'Str' );
+has 'length'      => ( is => 'rw', isa => 'Int',        required => 1 );
+has 'alpha'       => ( is => 'rw', isa => 'hmmAlpha',   required => 1 );
+
+# Boolean flags: rf, cs and map are required, mm and cons are optional.
+has 'rf'            => ( is => 'rw', isa => 'Bool', required => 1 );
+has [qw( mm cons )] => ( is => 'rw', isa => 'Bool' );
+has [qw( cs map )]  => ( is => 'rw', isa => 'Bool', required => 1 );
+
+has 'date' => ( is => 'rw', isa => 'Str', required => 1 );
+
+# Build command details; starts out as an empty hash when not supplied.
+has 'buildLine' => (
+  is       => 'rw',
+  isa      => 'HashRef[Str]',
+  required => 1,
+  default  => sub { {} },
+);
+
+has 'searchMethod' => ( is => 'rw', isa => 'Str' );
+has 'nSeq'         => ( is => 'rw', isa => 'Int', required => 1 );
+
+# --- Statistics, each validated by its own hash constraint --------------------
+has 'msvStats'     => ( is => 'rw', isa => 'hmmMsvStats',     required => 1 );
+has 'viterbiStats' => ( is => 'rw', isa => 'hmmViterbiStats', required => 1 );
+has 'forwardStats' => ( is => 'rw', isa => 'hmmForwardStats', required => 1 );
+
+has 'effn'  => ( is => 'rw', isa => 'Num', required => 1 );
+has 'cksum' => ( is => 'rw', isa => 'Int', required => 1 );
+
+# --- Optional numeric GA / TC / NC values -------------------------------------
+# presumably sequence-level (seq*) and domain-level (dom*) cut-offs -- TODO confirm
+has [qw( seqGA domGA seqTC domTC seqNC domNC )] => ( is => 'rw', isa => 'Num' );
+
+# --- Model body; every list defaults to a fresh empty array reference ---------
+has 'emissionLines' => ( is => 'rw', isa => 'ArrayRef[ArrayRef]', default => sub { [] } );
+has 'mapPos'        => ( is => 'rw', isa => 'ArrayRef[Int]',      default => sub { [] } );
+has 'compLines'     => ( is => 'rw', isa => 'ArrayRef[Str]',      default => sub { [] } );
+
+__PACKAGE__->meta->make_immutable;
+1;
+
b
diff -r 000000000000 -r 68a3648c7d91 pfamScan/Bio/Pfam/HMM/HMMIO.pm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfamScan/Bio/Pfam/HMM/HMMIO.pm Thu Dec 22 04:45:31 2016 -0500
[
b'@@ -0,0 +1,329 @@\n+# HMM.pm\n+#\n+# Author:        finnr\n+# Maintainer:    $Id: HMMIO.pm,v 1.3 2010-01-12 17:00:26 jm14 Exp $\n+# Version:       $Revision: 1.3 $\n+# Created:       Nov 24, 2008\n+# Last Modified: $Date: 2010-01-12 17:00:26 $\n+=head1 NAME\n+\n+Template - a short description of the class\n+\n+=cut\n+\n+package Bio::Pfam::HMM::HMMIO;\n+\n+=head1 DESCRIPTION\n+\n+A more detailed description of what this class does and how it does it.\n+\n+$Id: HMMIO.pm,v 1.3 2010-01-12 17:00:26 jm14 Exp $\n+\n+=head1 COPYRIGHT\n+\n+File: HMM.pm\n+\n+Copyright (c) 2007: Genome Research Ltd.\n+\n+Authors: Rob Finn (rdf@sanger.ac.uk), John Tate (jt6@sanger.ac.uk)\n+\n+ This is free software; you can redistribute it and/or\n+ modify it under the terms of the GNU General Public License\n+ as published by the Free Software Foundation; either version 2\n+ of the License, or (at your option) any later version.\n+ \n+ This program is distributed in the hope that it will be useful,\n+ but WITHOUT ANY WARRANTY; without even the implied warranty of\n+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
See the\n+ GNU General Public License for more details.\n+ \n+ You should have received a copy of the GNU General Public License\n+ along with this program; if not, write to the Free Software\n+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.\n+ or see the on-line version at http://www.gnu.org/copyleft/gpl.txt\n+ \n+=cut\n+\n+use strict;\n+use warnings;\n+\n+use Moose;\n+use Moose::Util::TypeConstraints;\n+use Carp;\n+use Bio::Pfam::HMM::HMM;\n+\n+#-------------------------------------------------------------------------------\n+\n+=head1 METHODS\n+\n+=cut\n+sub readHMM {\n+  my ($this, $hmm) = @_;\n+  \n+  unless($hmm){\n+    confess("No HMM passed in!"); \n+  }\n+  chomp($hmm);\n+  \n+  my @input;\n+  if(ref($hmm) eq \'GLOB\'){\n+    @input = <$hmm>;\n+  }elsif($hmm !~ /\\n/ and -e $hmm and -s $hmm){\n+    #Assume that we have a filename and try and open it;\n+    open(HMM, $hmm) || confess("Could not open $hmm:[$!]");\n+    @input = <HMM>;\n+  }else{\n+    @input = split(/\\n/, $hmm); \n+  }\n+  \n+  \n+  \n+  #Parse the header section!  
\n+  #HMMER3/f [3.1b1 | May 2013]\n+  #NAME  SEED\n+  #ACC   PF000001.1\n+  #DESC  A description\n+  #LENG  55\n+  #ALPH  amino\n+  #RF    no\n+  #MM    no\n+  #CONS  yes\n+  #CS    no\n+  #MAP   yes\n+  #DATE  Fri Nov 21 09:58:16 2008\n+  #COM   [1] /Users/finnr/Work/Software/hmmer-3.0.20081101/bin/hmmbuild -o hmmbuild.log HMM SEED\n+  #NSEQ  279\n+  #EFFN  4.966292\n+  #STATS LOCAL MSV      -11.4716  0.69948\n+  #STATS LOCAL VITERBI  -12.3713  0.69948\n+  #STATS LOCAL FORWARD   -5.5807  0.69948\n+\n+  #To add GA, TC, NC, CKSUM, DESC\n+  my($objHash);\n+  my $i =0; \n+  foreach ( @input ){  \n+    if(my ($version) = $_ =~ /(HMMER3.*)/){\n+      $objHash->{version} = $version;\n+    }elsif(my ($acc) = $_ =~ /^ACC\\s+(PF\\d+\\.\\d+)$/){\n+      $objHash->{accession} = $acc;\n+    }elsif(/NAME\\s+(\\S+)/){ \n+      $objHash->{name} =  $1 ;\n+    }elsif(/DESC\\s+(.*)/){ \n+      $objHash->{description} =   $1 ;\n+    }elsif(my ($length) = $_ =~ /^LENG\\s+(\\d+)/){\n+      $objHash->{length} = $length;\n+    }elsif( my ($alpha) = $_ =~ /^ALPH\\s+(\\S+)/){\n+      $objHash->{alpha} = $alpha;\n+    }elsif( my ($rf) = $_ =~ /^RF\\s+(no|yes)/){\n+      $objHash->{rf} = ($rf eq "no") ? 0 : 1; \n+    }elsif( my ($mm) = $_ =~ /^MM\\s+(no|yes)/){\n+      $objHash->{mm} = ($mm eq "no") ? 0 : 1; \n+    }elsif( my ($cons) = $_ =~ /^CONS\\s+(no|yes)/){\n+      $objHash->{cons} = ($cons eq "no") ? 0 : 1; \n+    }elsif(my ($cs) = $_ =~ /^CS\\s+(no|yes)/ ){\n+      $objHash->{cs} =  ($cs eq "no") ? 0 : 1; \n+    }elsif(my ($map) = $_ =~ /^MAP\\s+(no|yes)/){\n+      $objHash->{map} = ($map eq "no") ? 0 : 1; \n+    }elsif(my ($date) = $_ =~ /^DATE\\s+(.*)/){\n+      $objHash->{date} =  $date; \n+    }elsif(my ($sm) = $_ =~ /^SM\\s+(.*)/){\n+      $objHash->{searchMethod} =  $sm; \n+    \n+    }elsif(my ($options, $hmmName, $alignName) = $input[$i] =~ /^BM.*hmmbuild(.*)? 
(\\S+) (\\S+)$/){\n+      $objHash->{buildLine} = { cmd     => \'hmmbuild\', \n+   '..b' #No veryifiy that we have COMP line and the the number of emissionlines is equivalent to length \n+  unless(scalar( @{ $hmmObj->emissionLines } ) == $hmmObj->length){\n+    confess("Number of emssionLines does not match the length of the model, got ".scalar( @{ $hmmObj->emissionLines} ).\n+        " expected ".$hmmObj->length);  \n+  }\n+  \n+  unless($hmmObj->compLines){\n+    confess("No compLine set on HMM"); \n+  }\n+  \n+  if($hmmObj->map){\n+    unless(scalar(@{$hmmObj->mapPos}) == $hmmObj->length ){\n+      confess("HMM object had map set, but the number of map positions does not match the length of the HMM");  \n+    }; \n+  }\n+  return $hmmObj;  \n+}\n+\n+\n+\n+sub writeHMM {\n+  my ($this, $hmm, $hmmObj) = @_;\n+  \n+  unless($hmm){\n+    confess("No HMM out file passed in!"); \n+  }\n+  \n+  unless(ref($hmm) eq \'GLOB\'){\n+    my $hmmFile = $hmm;\n+    $hmm = undef;\n+    #Assume that we have a filename and try and open it;\n+    open($hmm, ">$hmmFile") || confess("Could not open $hmmFile:[$!]");\n+  }\n+\n+  print  $hmm $hmmObj->version."\\n";\n+  printf $hmm ("%-5s %s\\n", "NAME", $hmmObj->name);\n+  printf $hmm ("%-5s %s\\n", "ACC",  $hmmObj->accession) if($hmmObj->accession);\n+  printf $hmm ("%-5s %s\\n", "DESC", $hmmObj->description) if($hmmObj->description);\n+  printf $hmm ("%-5s %d\\n", "LENG", $hmmObj->length);\n+  printf $hmm ("%-5s %s\\n", "ALPH", $hmmObj->alpha);\n+  printf $hmm ("%-5s %s\\n", "RF", ($hmmObj->rf ? "yes" : "no"));\n+  printf $hmm ("%-5s %s\\n", "MM", ($hmmObj->mm ? "yes" : "no"));\n+  printf $hmm ("%-5s %s\\n", "CONS", ($hmmObj->cons ? "yes" : "no"));\n+  printf $hmm ("%-5s %s\\n", "CS", ($hmmObj->cs ? "yes" : "no"));\n+  printf $hmm ("%-5s %s\\n", "MAP", ($hmmObj->map ? 
"yes" : "no"));\n+  printf $hmm ("%-5s %s\\n", "DATE", $hmmObj->date);\n+  printf $hmm ("%-5s %d\\n", "NSEQ", $hmmObj->nSeq);\n+  printf $hmm ("%-5s %f\\n", "EFFN", $hmmObj->effn);\n+  printf $hmm ("%-5s %d\\n", "CKSUM", $hmmObj->cksum);\n+  printf $hmm ("%-5s %.2f %.2f;\\n", "GA", $hmmObj->seqGA, $hmmObj->domGA) if(defined($hmmObj->seqGA));\n+  printf $hmm ("%-5s %.2f %.2f;\\n", "TC", $hmmObj->seqTC, $hmmObj->domTC) if(defined($hmmObj->seqTC));\n+  printf $hmm ("%-5s %.2f %.2f;\\n", "NC", $hmmObj->seqNC, $hmmObj->domNC) if(defined($hmmObj->seqNC));\n+  \n+  printf $hmm ("%-5s %s %-9s %.4f  %.5f\\n", "STATS", "LOCAL", "MSV", $hmmObj->msvStats->{mu},  $hmmObj->msvStats->{lambda});\n+  printf $hmm ("%-5s %s %-9s %.4f  %.5f\\n", "STATS", "LOCAL", "VITERBI", $hmmObj->viterbiStats->{mu},  $hmmObj->viterbiStats->{lambda});\n+  printf $hmm ("%-5s %s %-9s %.4f  %.5f\\n", "STATS", "LOCAL", "FORWARD", $hmmObj->forwardStats->{tau},  $hmmObj->forwardStats->{lambda});\n+  \n+  print $hmm <<EOF;\n+HMM          A        C        D        E        F        G        H        I        K        L        M        N        P        Q        R        S        T        V        W        Y   \n+            m->m     m->i     m->d     i->m     i->i     d->m     d->d\n+EOF\n+  \n+  printf $hmm ("%7s ", "COMPO");\n+   foreach my $s (@{$hmmObj->compLines->[0]}){\n+       printf $hmm ("  %7s", $s);\n+    }\n+    print $hmm "\\n";\n+     \n+    print $hmm (" " x 8);\n+    foreach my $s (@{$hmmObj->compLines->[1]}){\n+       printf $hmm ("  %7s", $s);\n+    }\n+    print $hmm "\\n";\n+    \n+    print $hmm (" " x 8);\n+    foreach my $s (@{$hmmObj->compLines->[2]}){\n+       printf $hmm ("  %7s", $s);\n+    }\n+    print $hmm "\\n";\n+    \n+  \n+  my $pos = 1;\n+  foreach my $el (@{ $hmmObj->emissionLines }){\n+    printf $hmm ("%7s ", $pos);\n+    foreach my $s (@{$el->[0]}){\n+       printf $hmm ("  %7s", $s);\n+    }\n+    if($hmmObj->map){\n+      printf $hmm ("%7s - -\\n", 
$hmmObj->mapPos->[$pos - 1]);  \n+    }else{\n+      print $hmm "\\n";\n+    } \n+    print $hmm (" " x 8);\n+    foreach my $s (@{$el->[1]}){\n+       printf $hmm ("  %7s", $s);\n+    }\n+    print $hmm "\\n";\n+    print $hmm (" " x 8);\n+    foreach my $s (@{$el->[2]}){\n+       printf $hmm ("  %7s", $s);\n+    }\n+    print $hmm "\\n"; \n+    $pos++; \n+  }\n+  \n+\n+  print $hmm "//\\n";\n+}\n+\n+\n+\n+1;\n+\n'
b
diff -r 000000000000 -r 68a3648c7d91 pfamScan/Bio/Pfam/HMM/HMMMatch.pm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfamScan/Bio/Pfam/HMM/HMMMatch.pm Thu Dec 22 04:45:31 2016 -0500
[
@@ -0,0 +1,62 @@
+
+package Bio::Pfam::HMM::HMMMatch;
+
+use strict;
+use warnings;
+use Moose;
+use Moose::Util::TypeConstraints;
+
+# 'evalue': an integer, a decimal, or scientific notation with a signed exponent.
+# '|' is a literal inside a character class, so the sign class is [+-], not [+|-].
+subtype 'evalue'
+  => as 'Str'
+  => where { $_ =~ m/^(?:\d+(?:\.\d+)?e[+-]\d+|\d+\.\d+|\d+)$/ }
+  => message { "$_ does not look like an evalue" };
+
+has 'evalue' => (
+  isa       => 'evalue',
+  is        => 'rw',
+  required  => 1
+);
+
+has 'bits' => (
+  isa => 'Str',
+  is  => 'rw',
+  required => 1
+);
+
+has 'name' => (
+  isa => 'Str',
+  is  => 'rw',
+  required => 1
+);
+
+has bias => (
+  isa => 'Num',
+  is  => 'rw'
+);
+
+__PACKAGE__->meta->make_immutable;
+
+=head1 COPYRIGHT
+
+Copyright (c) 2007: Genome Research Ltd.
+
+Authors: Rob Finn (rdf@sanger.ac.uk), John Tate (jt6@sanger.ac.uk)
+
+This is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 2 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program. If not, see <http://www.gnu.org/licenses/>.
+
+=cut
+
+1;
b
diff -r 000000000000 -r 68a3648c7d91 pfamScan/Bio/Pfam/HMM/HMMResults.pm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfamScan/Bio/Pfam/HMM/HMMResults.pm Thu Dec 22 04:45:31 2016 -0500
[
b'@@ -0,0 +1,582 @@\n+# Bio::Pfam::HMM::HMMResults.pm\n+#\n+# Author:        finnr\n+# Maintainer:    $Id: HMMResults.pm,v 1.3 2009-12-15 14:38:08 jt6 Exp $\n+# Version:       $Revision: 1.3 $\n+# Created:       Nov 19, 2008\n+# Last Modified: $Date: 2009-12-15 14:38:08 $\n+\n+=head1 NAME\n+\n+Bio::Pfam::HMM::HMMResults - A object to represents the results from hmmsearch\n+\n+=cut\n+\n+package Bio::Pfam::HMM::HMMResults;\n+\n+=head1 DESCRIPTION\n+\n+A more detailed description of what this class does and how it does it.\n+\n+$Id: HMMResults.pm,v 1.3 2009-12-15 14:38:08 jt6 Exp $\n+\n+=head1 COPYRIGHT\n+\n+File: Bio::Pfam::HMM::HMMResults.pm\n+\n+Copyright (c) 2007: Genome Research Ltd.\n+\n+Authors: Rob Finn (rdf@sanger.ac.uk)\n+\n+ This is free software; you can redistribute it and/or\n+ modify it under the terms of the GNU General Public License\n+ as published by the Free Software Foundation; either version 2\n+ of the License, or (at your option) any later version.\n+ \n+ This program is distributed in the hope that it will be useful,\n+ but WITHOUT ANY WARRANTY; without even the implied warranty of\n+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
See the\n+ GNU General Public License for more details.\n+ \n+ You should have received a copy of the GNU General Public License\n+ along with this program; if not, write to the Free Software\n+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.\n+ or see the on-line version at http://www.gnu.org/copyleft/gpl.txt\n+ \n+=cut\n+\n+use strict;\n+use warnings;\n+\n+use Moose;\n+use Moose::Util::TypeConstraints;\n+use Bio::Pfam::HMM::HMMSequence;\n+use Bio::Pfam::HMM::HMMUnit;\n+\n+#\n+#-------------------------------------------------------------------------------\n+# Attributes\n+\n+has \'hmmerVersion\' => (\n+  isa       => \'Str\', \n+  is        => \'rw\', \n+);\n+\n+has \'hmmName\' => (\n+  isa => \'Str\',\n+  is  => \'rw\'\n+);\n+\n+has \'seqDB\' => (\n+  isa => \'Str\',\n+  is  => \'rw\'\n+);\n+\n+has hmmLength => (\n+  isa => \'Int\',\n+  is  => \'rw\'\n+);\n+\n+has \'thisFile\' => (\n+  isa => \'Str\',\n+  is  => \'rw\'\n+);\n+\n+has seedName => (\n+  isa => \'Str\',\n+  is  => \'rw\'\n+);\n+\n+has \'seqs\' => (\n+  isa => \'HashRef\',\n+  is  => \'rw\',\n+  default => sub { {} },\n+);\n+\n+has \'units\' => (\n+  isa => \'ArrayRef\',\n+  is  => \'rw\',\n+  default => sub { [] },\n+);\n+\n+has \'domThr\' => (\n+  isa => \'Num\',\n+  is   => \'rw\',\n+  default => \'25.0\'\n+);\n+\n+has \'seqThr\' => (\n+  isa => \'Num\',\n+  is  => \'rw\',\n+  default => \'25.0\'\n+);\n+\n+has \'evalueThr\' => (\n+  isa => \'Num\',\n+  is  => \'rw\'\n+);\n+\n+has \'domTC\' => (\n+  isa => \'Num\',\n+  is  => \'rw\'\n+);\n+\n+has \'seqTC\' => (\n+  isa => \'Num\',\n+  is  => \'rw\'\n+);\n+\n+has \'domNC\' => (\n+  isa => \'Num\',\n+  is  => \'rw\'\n+);\n+\n+has \'seqNC\' => (\n+  isa => \'Num\',\n+  is  => \'rw\'\n+);\n+\n+has \'randSeedNum\' => (\n+  isa => \'Int\',\n+  is  => \'rw\'\n+);\n+\n+has \'description\' => (\n+  isa => \'Str\',\n+  is  => \'rw\'\n+);\n+\n+has \'seqName\' => (\n+  isa => \'Str\',\n+  is  => \'rw\'\n+);\n+\n+has \'seqLength\' => 
(\n+  isa => \'Int\',\n+  is  => \'rw\'\n+);\n+\n+\n+has \'eof\' => (\n+  isa => \'Int\',\n+  is  => \'rw\',\n+  default => 0\n+);\n+\n+has \'program\' => (\n+  isa => \'Str\',\n+  is  => \'rw\'\n+);\n+\n+=head1 METHODS\n+\n+=head2 addHMMSeq \n+\n+  Title    : addHMMSeq\n+  Usage    : $hmmRes->addHMMSeq( $hmmSeqObj )\n+  Function : Adds a Bio::Pfam::HMM::HMMSequence object to the results object \n+  Args     : A Bio::Pfam::HMM::HMMSequence object\n+  Returns  : nothing\n+  \n+=cut\n+\n+sub addHMMSeq {\n+  my( $self, $hmmSeq ) = @_;\n+\n+  unless($hmmSeq->isa(\'Bio::Pfam::HMM::HMMSequence\')){\n+    die \'Trying to add a non Bio::Pfam::HMM::HMMSequence object\';  \n+  }\n+\n+  if($self->seqs){\n+    if($self->seqs->{$hmmSeq->name}){\n+      die "Trying to add the same sequence twice"; \n+    }\n+  }\n+  \n+  $self->seqs->{$hmmSeq->name} = $hmmSeq;\n+}\n+\n+\n+=head2 eachHMMSeq \n+\n+  Title    : eachHMMSeq\n+  Usage    : my @seqs = $hmmRes->eachHMMSeq\n+  Function : Returns an array reference containing the references to all of the\n+           : Bio::Pfam::HMM::HMMSequence objects stored in the HMMResults object.\n+  Args     : None\n+  Return'..b'ing ED line for invalid hmm unit: " . $e->{seq}."/".$e->{oldFrom}."-".$e->{oldTo}. "\\n";\n+\t}\n+\telse {\n+\t  warn $e->{seq}."/".$e->{oldFrom}."-".$e->{oldTo}." does not appear in the list of hmm units - bad ED line\\n";\n+\t}\n+      }\n+    }else{ #Sequence not found - bad ED\n+      if($removeBadEd) {\n+\tprint "Removing ED line for invalid hmm unit: " . $e->{seq}."/".$e->{oldFrom}."-".$e->{oldTo}. "\\n";\n+      }\n+      else {\n+\twarn $e->{seq}." 
does not appear in the list of hmm units - bad ED line\\n";  \n+      }\n+    }\n+  }\n+  return(\\@validEd) if($removeBadEd);\n+  \n+}\n+\n+sub remove_overlaps_by_clan {\n+\n+    my ($self, $clanmap, $nested) = @_;\n+\n+    my $new = Bio::Pfam::HMM::HMMResults->new;\n+    $new->seqName($self->seqName);\n+   \n+    foreach my $unit ( sort { $a->evalue <=> $b->evalue }  @{ $self->units } ) {\n+\n+        #check if it overlaps before adding\n+\tmy $o;\n+\t\n+\tforeach my $u ( @{ $new->units } ) {\n+\t    \n+\t    if( exists($clanmap->{$unit->name}) and exists($clanmap->{$u->name}) and ($clanmap->{$unit->name} eq $clanmap->{$u->name}) ) {\n+\t\tif( overlap( $unit, $u ) ) {\n+\t\t    if(exists($$nested{$unit->name}{$u->name})) {\n+\t\t\tnext;\n+\t\t    }\n+\t\t    else {\n+\t\t\t$o=1;\n+\t\t\tlast;\n+\t\t    }\n+\t\t}\n+\t\t\n+\t    }\n+\t}\n+\tunless($o) {\n+\t    if(! $new->seqs->{$unit->name}) {\n+\t\t\n+\t\t$new->addHMMSeq( Bio::Pfam::HMM::HMMSequence->new( {  name       => $self->seqs->{$unit->name}->name,\n+\t\t\t\t\t\t\t\t      desc       => $self->seqs->{$unit->name}->desc,\n+\t\t\t\t\t\t\t\t      bits       => $self->seqs->{$unit->name}->bits,\n+\t\t\t\t\t\t\t\t      evalue     => $self->seqs->{$unit->name}->evalue,\n+\t\t\t\t\t\t\t\t      numberHits => $self->seqs->{$unit->name}->numberHits}) );\n+\t    \n+\t    }\n+\t    $new->addHMMUnit($unit);\n+\t}\n+\n+    }\n+    return $new;\n+}\n+\n+\n+\n+sub overlap {\n+    # does unit1 overlap with unit2?\n+    my $unit1 = shift;\n+    my $unit2 = shift;\n+    my( $u1, $u2 ) = sort { $a->seqFrom <=> $b->seqFrom } ( $unit1, $unit2 );\n+\n+\n+    if( $u2->seqFrom <= $u1->seqTo ) {\n+        return 1;\n+    } \n+\n+    return 0;\n+}\n+\n+\n+sub results {\n+  my ( $self, $pfamScanData, $e_value ) = @_;\n+\n+  my @results = ();\n+  foreach my $unit ( sort { $a->seqFrom <=> $b->seqFrom } @{ $self->units } ) {    \n+\n+    my $pfamB = $unit->name =~ /^Pfam-B/;\n+\n+    #Filter results based on thresholds\n+    if ( 
$unit->name =~ /^Pfam-B/ ) {\n+      next unless ( $self->seqs->{$unit->name}->evalue <= 0.001 and $unit->evalue <= 0.001 );\n+      $pfamB = 1;\n+    }\n+    else {\t  \n+      if ( $e_value ) {\n+        next unless ( $self->seqs->{$unit->name}->evalue <= $e_value and $unit->evalue <= $e_value ) ;\n+      } \n+      else {\n+       next unless $unit->sig;\n+      }\n+    }\n+\n+    push @results, {\n+      seq          => { from => $unit->seqFrom,\n+                        to   => $unit->seqTo,\n+                        name => $self->seqName },\n+      env          => { from => $unit->envFrom,\n+                        to   => $unit->envTo },\n+\n+      hmm          => { from => $unit->hmmFrom,\n+                        to   => $unit->hmmTo },\n+\n+      model_length => $pfamScanData->{_model_len}->{ $unit->name },\n+      bits         => $unit->bits,\n+      evalue       => $unit->evalue,\n+      acc          => $pfamScanData->{_accmap}->{ $unit->name },\n+      name         => $unit->name,\n+      desc         => $pfamScanData->{_desc}->{ $unit->name },\n+      type         => $pfamB ? undef : $pfamScanData->{_type}->{ $unit->name },\n+      clan         => $pfamB ? undef : \n+                         $pfamScanData->{_clanmap}->{ $unit->name } || \'No_clan\',\n+\n+      act_site     => $pfamB ? undef : $unit->{act_site},\n+      sig          => $pfamB ? "NA" : $unit->sig,\n+      align        => [ sprintf( \'#HMM       %s\', $unit->hmmalign->{hmm} ),\n+                        sprintf( \'#MATCH     %s\', $unit->hmmalign->{match} ),\n+                        sprintf( \'#PP        %s\', $unit->hmmalign->{pp} ),\n+                        sprintf( \'#SEQ       %s\', $unit->hmmalign->{seq} ) ]\n+    };\n+  }\n+\n+  return \\@results;\n+}\n+\n+1;\n'
b
diff -r 000000000000 -r 68a3648c7d91 pfamScan/Bio/Pfam/HMM/HMMResultsIO.pm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfamScan/Bio/Pfam/HMM/HMMResultsIO.pm Thu Dec 22 04:45:31 2016 -0500
[
b'@@ -0,0 +1,1186 @@\n+# HMMResultsIO.pm\n+#\n+# Author:        rdf\n+# Maintainer:    $Id: HMMResultsIO.pm,v 1.2 2009-12-01 15:42:20 jt6 Exp $\n+# Version:       $Revision: 1.2 $\n+# Created:       Nov 16, 2008\n+# Last Modified: $Date: 2009-12-01 15:42:20 $\n+\n+=head1 NAME\n+\n+Template - a short description of the class\n+\n+=cut\n+\n+package Bio::Pfam::HMM::HMMResultsIO;\n+\n+=head1 DESCRIPTION\n+\n+A more detailed description of what this class does and how it does it.\n+\n+$Id: HMMResultsIO.pm,v 1.2 2009-12-01 15:42:20 jt6 Exp $\n+\n+=head1 COPYRIGHT\n+\n+File: HMMResultsIO.pm\n+\n+Copyright (c) 2007: Genome Research Ltd.\n+\n+Authors: Rob Finn (rdf@sanger.ac.uk), John Tate (jt6@sanger.ac.uk)\n+\n+ This is free software; you can redistribute it and/or\n+ modify it under the terms of the GNU General Public License\n+ as published by the Free Software Foundation; either version 2\n+ of the License, or (at your option) any later version.\n+ \n+ This program is distributed in the hope that it will be useful,\n+ but WITHOUT ANY WARRANTY; without even the implied warranty of\n+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
See the\n+ GNU General Public License for more details.\n+ \n+ You should have received a copy of the GNU General Public License\n+ along with this program; if not, write to the Free Software\n+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.\n+ or see the on-line version at http://www.gnu.org/copyleft/gpl.txt\n+ \n+=cut\n+\n+use strict;\n+use warnings;\n+use Moose;\n+use Carp;\n+\n+#All the things we need to objectfy the search results\n+use Bio::Pfam::HMM::HMMResults;\n+use Bio::Pfam::HMM::HMMSequence;\n+use Bio::Pfam::HMM::HMMUnit;\n+\n+#-------------------------------------------------------------------------------\n+\n+=head1 ATTRIBUTES\n+\n+\n+\n+=cut\n+\n+has \'align\' => (\n+  isa     => \'Int\',\n+  is      => \'rw\',\n+  default => 0\n+);\n+\n+has \'outfile\' => (\n+  isa     => \'Str\',\n+  is      => \'rw\',\n+  default => \'OUTPUT\'\n+);\n+\n+has \'pfamout\' => (\n+  isa     => \'Str\',\n+  is      => \'rw\',\n+  default => \'PFAMOUT\'\n+);\n+\n+has \'scores\' => (\n+  isa     => \'Str\',\n+  is      => \'rw\',\n+  default => \'scores\'\n+);\n+\n+#-------------------------------------------------------------------------------\n+\n+=head1 METHODS\n+\n+=head2 parseHMMER3\n+\n+  Title    : parseHMMER \n+  Usage    : $hmmResIO->parseHMMSearch( filename )\n+  Function : Parse the output from a HMMER3 search results \n+  Args     : Filename containing the search \n+  Returns  : A Bio::Pfam::HMM::HMMResults object\n+  \n+=cut\n+\n+sub parseHMMER3 {\n+  my ( $self, $filename ) = @_;\n+  my $fh;\n+ \n+  if(ref($filename) eq \'GLOB\'){\n+    $fh = $filename;\n+  }else{\n+    open( $fh, $filename ) or confess "Could not open $filename:[$!]\\n";\n+  }\n+  \n+#  open( $fh, $filename ) or confess "Could not open $filename:[$!]\\n";\n+  my $hmmRes = Bio::Pfam::HMM::HMMResults->new;\n+  $self->_readHeader( $fh, $hmmRes );\n+  $self->_readSeqHits( $fh, $hmmRes );\n+  $self->_readUnitHits( $fh, $hmmRes );\n+  $self->_readFooter($fh, 
$hmmRes);\n+  return ($hmmRes);\n+}\n+\n+\n+\n+sub parseMultiHMMER3 {\n+  my ( $self, $filename ) = @_;\n+  my $fh;\n+  \n+  if(ref($filename) eq \'GLOB\'){\n+    $fh = $filename;\n+  }elsif( ref($filename) and $filename->isa(\'IO::File\') ) {\n+      $fh = $filename;\n+  }else{\n+    open( $fh, $filename ) or confess "Could not open $filename:[$!]\\n";\n+  }  \n+  \n+  my @hmmResAll;\n+  my $program;\n+  while(!eof($fh)){\n+    my $hmmRes = Bio::Pfam::HMM::HMMResults->new;\n+    my $eof = $self->_readHeader( $fh, $hmmRes ); \n+    last if($eof);\n+    push(@hmmResAll, $hmmRes);\n+    if($hmmRes->program) {\n+\t$program = $hmmRes->program;\n+    }\n+    else {\n+\t$hmmRes->program($program);\n+    }\n+    $self->_readSeqHits( $fh, $hmmRes );\n+    $self->_readUnitHits( $fh, $hmmRes );\n+    $self->_readFooter($fh, $hmmRes);\n+  }\n+  return (\\@hmmResAll);\n+}\n+\n+sub parseSplitHMMER3 {\n+  my($self, $files ) = @_;\n+  \n+  my $hmmRes = Bio::Pfam::HMM::HMMResults->new;\n+  \n+  foreach my $filename (@{$files}){\n+    my ($fh);\n+    open( $fh, $filen'..b'        828.85\n+# CPU time: 115.36u 4.45s 00:01:59.81 Elapsed: 00:03:01\n+\n+#sub writeHMMSearch {\n+#  my ( $self, $hmmRes ) = @_;\n+#  my $fh;\n+#  open($fh, ">".$self->outfile."\\n");\n+#\n+#  $self->_writeHeader($fh, $hmmRes);\n+#  $self->_writeSeqHits( $fh, $hmmRes);\n+#  $self->_writeDomHits( $fh, $hmmRes);\n+#  $self->_writeAlign( $fh, $hmmRes) if($self->align);\n+#  $self->_writeInternalSummary( $fh, $hmmRes);\n+#}\n+#sub mergeHMMSearch {\n+#  my ( $self, $filenames ) = @_;\n+#}\n+\n+\n+sub write_ascii_out {\n+\n+    my ($self, $HMMResults, $fh, $scanData, $e_seq, $e_dom, $b_seq, $b_dom) = @_;\n+\n+\n+    $scanData->{_max_seqname} = 20 unless($scanData->{_max_seqname} or $scanData->{_max_seqname} < 1);\n+    \n+    my $ga;\n+\n+    if($e_seq or $e_dom) {\n+\t$e_seq = $e_dom unless($e_seq);\n+\t$e_dom = "10" unless($e_dom);\n+    } \n+    elsif($b_seq or $b_dom) {\n+\t$b_seq = $b_dom 
unless($b_seq);\n+\t$b_dom = "0" unless($b_dom);\n+    }\n+    else {\n+\t$ga = 1;\n+    }\n+\n+\n+    foreach my $unit ( sort { $a->seqFrom <=> $b->seqFrom } @{ $HMMResults->units } ) {    \n+\n+        if($unit->name =~ /Pfam\\-B/) {\n+\n+\t    next unless($HMMResults->seqs->{$unit->name}->evalue <= "0.001" and $unit->evalue <= "0.001");\n+\n+\n+\t    printf $fh "%-".$scanData->{_max_seqname}."s %6d %6d %6d %6d %-11s %-16s %7s %5d %5d %5d %8s %9s %3s %-8s\\n",\n+\t    $HMMResults->seqName,\n+\t    $unit->seqFrom,\n+\t    $unit->seqTo,\n+\t    $unit->envFrom,\n+\t    $unit->envTo,\n+\t    $scanData->{_accmap}->{ $unit->name },\n+\t    $unit->name,\n+\t    "Pfam-B",\n+\t    $unit->hmmFrom,\n+\t    $unit->hmmTo,\n+\t    $scanData->{_model_len}->{ $unit->name },\n+\t    $unit->bits,\n+\t    $unit->evalue,\n+\t    "NA",\n+\t    "NA";\n+\n+\n+\t}\n+\telse {\n+\n+            #Filter results based on thresholds\n+\t    if($ga) {\n+\t\tnext unless($unit->sig);\n+\t    }\n+\t    if($e_seq) {\n+\t\tnext unless($HMMResults->seqs->{$unit->name}->evalue <= $e_seq and $unit->evalue <= $e_dom);\n+\t    }\n+\t    if($b_seq) {\n+\t\t\n+\t\tnext unless($HMMResults->seqs->{$unit->name}->bits >= $b_seq and $unit->bits >= $b_dom);\n+\t    }\n+\t    \n+\t    my $clan = $scanData->{_clanmap}->{ $unit->name } || "No_clan";\n+\t    \n+\t    \n+\t    printf $fh "%-".$scanData->{_max_seqname}."s %6d %6d %6d %6d %-11s %-16s %7s %5d %5d %5d %8s %9s %3d %-8s ",\n+\t    $HMMResults->seqName,\n+\t    $unit->seqFrom,\n+\t    $unit->seqTo,\n+\t    $unit->envFrom,\n+\t    $unit->envTo,\n+\t    $scanData->{_accmap}->{ $unit->name },\n+\t    $unit->name,\n+\t    $scanData->{_type}->{ $unit->name },\n+\t    $unit->hmmFrom,\n+\t    $unit->hmmTo,\n+\t    $scanData->{_model_len}->{ $unit->name },\n+\t    $unit->bits,\n+\t    $unit->evalue,\n+\t    $unit->sig, \n+\t    $clan;\n+\t\n+\t    \n+\t    if($unit->{\'act_site\'}) {\n+\t\tlocal $" = ",";\n+\t\tprint $fh 
"predicted_active_site[@{$unit->{\'act_site\'}}]";\n+\t    }\n+\t\n+\t    if($scanData->{_translate}){\n+\t      my $strand = \'?\';\n+\t      my $start = \'-\';\n+\t      my $end   = \'-\';\n+\t      if(exists($scanData->{_orf}->{$HMMResults->seqName})){\n+\t       $strand = $scanData->{_orf}->{$HMMResults->seqName}->{strand};  \n+\t       if($strand eq \'+\'){\n+\t         $start = $scanData->{_orf}->{$HMMResults->seqName}->{start} + ($unit->envFrom * 3) - 3;\n+\t         $end = $scanData->{_orf}->{$HMMResults->seqName}->{start} + ($unit->envTo * 3) - 3;\n+\t       }elsif($strand eq \'-\'){\n+\t         $start = $scanData->{_orf}->{$HMMResults->seqName}->{start} - ($unit->envFrom * 3) + 3;\n+           $end = $scanData->{_orf}->{$HMMResults->seqName}->{start} - ($unit->envTo * 3) + 3;\n+\t       }\n+\t      }\n+\t      print $fh "$strand $start $end";\n+\t    }\n+\t\n+\t    print $fh "\\n";\n+\t}\n+\n+\tif($scanData->{_align}){\n+\t    print $fh sprintf( "%-10s %s\\n", "#HMM",   $unit->hmmalign->{hmm} );\n+\t    print $fh sprintf( "%-10s %s\\n", "#MATCH", $unit->hmmalign->{match} );\n+\t    print $fh sprintf( "%-10s %s\\n", "#PP",   $unit->hmmalign->{pp});\n+\t    print $fh sprintf( "%-10s %s\\n", "#SEQ",   $unit->hmmalign->{seq});\n+\t    print $fh sprintf( "%-10s %s\\n", "#CS",   $unit->hmmalign->{cs}) if($unit->hmmalign->{cs});\n+\t}\n+\t\n+    }\n+    \n+}\n+\n+1;\n'
b
diff -r 000000000000 -r 68a3648c7d91 pfamScan/Bio/Pfam/HMM/HMMSequence.pm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfamScan/Bio/Pfam/HMM/HMMSequence.pm Thu Dec 22 04:45:31 2016 -0500
[
@@ -0,0 +1,102 @@
+
+package Bio::Pfam::HMM::HMMSequence;
+
+use strict;
+use warnings;
+
+use Moose;
+use Moose::Util::TypeConstraints;
+use Scalar::Util ();
+
+extends 'Bio::Pfam::HMM::HMMMatch';
+
+# The 'evalue' type used below is declared in Bio::Pfam::HMM::HMMMatch.
+has sumEvalue => (
+  isa      => 'evalue',
+  is       => 'rw',
+);
+
+has H2mode => (
+  isa => 'Str',
+  is  => 'rw'
+);
+
+has sumScore => (
+  isa      => 'Num',
+  is       => 'rw',
+);
+
+has desc => (
+  isa      => 'Str',
+  is       => 'rw',
+  required => 1
+);
+
+has numberHits => (
+  isa      => 'Int',
+  is       => 'rw',
+  required => 1
+);
+
+has 'exp' => (
+  isa => 'Num',
+  is  => 'rw'
+);
+
+# HMMUnit objects attached to this sequence; filled via addHMMUnit.
+has hmmUnits => (
+  isa => "ArrayRef[ Bio::Pfam::HMM::HMMUnit ]",
+  is  => 'rw',
+  default => sub { [] }
+);
+
+#-------------------------------------------------------------------------------
+
+=head1 Subroutines
+
+=head2 addHMMUnit 
+
+  Title    : addHMMUnit
+  Usage    : $hmmseq->addHMMUnit($hmmUnit) 
+  Function : Adds a hmmUnit to a sequence. Anything that is not a Bio::Pfam::HMM::HMMUnit object is skipped with a warning
+  Args     : A Bio::Pfam::HMM::HMMUnit object
+  Returns  : Nothing
+  
+=cut
+
+sub addHMMUnit {
+  my ( $self, $hmmUnit ) = @_;
+  # Test blessed() first: calling ->isa on undef or on an unblessed reference
+  # dies, which would bypass the warning below.
+  if(Scalar::Util::blessed($hmmUnit) and $hmmUnit->isa("Bio::Pfam::HMM::HMMUnit")){
+    push(@{$self->hmmUnits}, $hmmUnit);
+  }else{
+    my $label = defined($hmmUnit) ? $hmmUnit : 'undef';
+    warn "$label is not a Bio::Pfam::HMM::HMMUnit, not added\n"; 
+  }
+}
+
+__PACKAGE__->meta->make_immutable;
+
+=head1 COPYRIGHT
+
+Copyright (c) 2007: Genome Research Ltd.
+
+Authors: Rob Finn (rdf@sanger.ac.uk), John Tate (jt6@sanger.ac.uk)
+
+This is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 2 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program. If not, see <http://www.gnu.org/licenses/>.
+
+=cut
+
+1;
b
diff -r 000000000000 -r 68a3648c7d91 pfamScan/Bio/Pfam/HMM/HMMUnit.pm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfamScan/Bio/Pfam/HMM/HMMUnit.pm Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,138 @@
+
+package Bio::Pfam::HMM::HMMUnit;
+
+use strict;
+use warnings;
+
+use Moose;
+use Moose::Util::TypeConstraints;
+
+extends 'Bio::Pfam::HMM::HMMMatch';
+
+# 'Domain': an Int that must be greater than zero.
+subtype 'Domain',
+  as 'Int',
+  where { $_ > 0 };
+
+# Disabled: coercion from Str to 'Domain'.
+#coerce 'Domain'
+#  => from 'Str'
+#    => via {
+#      my $d;
+#      if(/(\d+)\/\d+/){
+#        $d = $1;
+#      }
+#      return $d;
+#    };
+
+# Disabled: bounded type for protein coordinates.
+#subtype 'proteinCoos'
+#  => as 'Int'
+#  => where { $_ > 0 && $_ < 100000 }
+#  => message { 'Protein coordinates are expected to be positive and less the 100,000'};
+
+has seqEvalue => (
+  is  => 'rw',
+  isa => 'Num',
+);
+
+has domain => (
+  is  => 'rw',
+  isa => 'Domain',
+);
+
+has seqFrom => (
+  is       => 'rw',
+  isa      => 'Int',
+  required => 1,
+);
+
+has seqTo => (
+  is       => 'rw',
+  isa      => 'Int',
+  required => 1,
+);
+
+#has 'indEvalue' => (  
+#  isa => 'evalue',
+#  is  => 'rw',
+#  required => 1,
+#);
+
+has domEvalue => (
+  is  => 'rw',
+  isa => 'evalue',
+);
+
+has hmmalign => (
+  is      => 'rw',
+  isa     => 'HashRef',
+  default => sub { {} },
+);
+
+has hmmFrom => (
+  is       => 'rw',
+  isa      => 'Int',
+  required => 1,
+);
+
+has hmmTo => (
+  is       => 'rw',
+  isa      => 'Int',
+  required => 1,
+);
+
+has envFrom => (
+  is  => 'rw',
+  isa => 'Int',
+);
+
+has envTo => (
+  is  => 'rw',
+  isa => 'Int',
+);
+
+has coreFrom => (
+  is  => 'rw',
+  isa => 'Str',
+);
+
+has coreTo => (
+  is  => 'rw',
+  isa => 'Str',
+);
+
+has aliAcc => (
+  is  => 'rw',
+  isa => 'Num',
+);
+
+has sig => (
+  is  => 'rw',
+  isa => 'Int',
+);
+
+__PACKAGE__->meta->make_immutable;
+
+=head1 COPYRIGHT
+
+Copyright (c) 2007: Genome Research Ltd.
+
+Authors: Rob Finn (rdf@sanger.ac.uk), John Tate (jt6@sanger.ac.uk)
+
+This is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 2 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program. If not, see <http://www.gnu.org/licenses/>.
+
+=cut
+
+1;
b
diff -r 000000000000 -r 68a3648c7d91 pfamScan/Bio/Pfam/Scan/PfamScan.pm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfamScan/Bio/Pfam/Scan/PfamScan.pm Thu Dec 22 04:45:31 2016 -0500
b
b'@@ -0,0 +1,957 @@\n+\n+=head1 NAME\n+\n+Bio::Pfam::Scan::PfamScan\n+\n+=cut\n+\n+package Bio::Pfam::Scan::PfamScan;\n+\n+=head1 SYNOPSIS\n+\n+  my $ps = Bio::Pfam::Scan::PfamScan->new(\n+             -cut_off => $hmmscan_cut_off,\n+             -dir => $dir,\n+             -clan_overlap => $clan_overlap,\n+             -fasta => $fasta,\n+             -align => $align,\n+             -as => $as\n+           );\n+\n+  $ps->search;\n+  $ps->write_results;\n+\n+=head1 DESCRIPTION\n+\n+$Id: PfamScan.pm,v 1.4 2010-01-12 09:41:42 jm14 Exp $\n+\n+=cut\n+\n+use strict;\n+use warnings;\n+\n+use Bio::Pfam::HMM::HMMResultsIO;\n+use Bio::Pfam::Active_site::as_search;\n+use Bio::SimpleAlign;\n+use Bio::Pfam::Scan::Seq;\n+\n+use Carp;\n+use IPC::Run qw( start finish );\n+\n+#-------------------------------------------------------------------------------\n+#- constructor -----------------------------------------------------------------\n+#-------------------------------------------------------------------------------\n+\n+=head1 METHODS\n+\n+=head2 new\n+\n+The only constructor for the object. 
Accepts a set of arguments that specify\n+the parameters for the search:\n+\n+=over\n+\n+=item -cut_off\n+\n+=item -dir\n+\n+=item -clan_overlap\n+\n+=item -fasta\n+\n+=item -sequence\n+\n+=item -align\n+\n+=item -hmm\n+\n+=item -as\n+\n+=back\n+\n+=cut\n+\n+sub new {\n+  my ( $class, @args ) = @_;\n+\n+  my $self = {};\n+  bless $self, $class;\n+\n+  # To avoid hard coding the location for the binary, we assume it will be on the path.....\n+  $self->{_HMMSCAN} = \'hmmscan\';\n+\n+  # handle arguments, if we were given any here\n+  $self->_process_args(@args) if @args;\n+\n+  return $self;\n+}\n+\n+#-------------------------------------------------------------------------------\n+#- public methods --------------------------------------------------------------\n+#-------------------------------------------------------------------------------\n+\n+=head2 search\n+\n+The main method on the object. Performs a C<hmmscan> search using the supplied\n+sequence and the specified HMM library.\n+\n+=cut\n+\n+sub search {\n+  my ( $self, @args ) = @_;\n+\n+  # handle the arguments, if we were handed any here\n+  $self->_process_args(@args) if @args;\n+\n+  # set up the output header\n+  $self->_build_header;\n+\n+  croak qq(FATAL: no sequence given; set the search parameters before calling "search")\n+    unless defined $self->{_sequence};\n+\n+  my ( %AllResults, $pfamB, $firstResult );\n+\n+  foreach my $hmmlib ( @{ $self->{_hmmlib} } ) {\n+\n+    my ( @hmmscan_cut_off, $seq_evalue, $dom_evalue );\n+    if ( $hmmlib !~ /Pfam\\-B/ ) {\n+      @hmmscan_cut_off = @{ $self->{_hmmscan_cutoff} };\n+    }\n+    else {\n+      $pfamB      = 1;\n+      $seq_evalue = 0.001;\n+      $dom_evalue = 0.001;\n+\n+      # It\'s a pfamB search so use some default cut off values\n+      push @hmmscan_cut_off, \'-E\', $seq_evalue, \'--domE\', $dom_evalue;\n+    }\n+\n+    push @{ $self->{_header} },\n+      "#     cpu number specified: " . $self->{_cpu} . 
"\\n"\n+      if ( $hmmlib !~ /Pfam\\-B/ and $self->{_cpu} );\n+\n+    push @{ $self->{_header} },\n+      "#        searching against: "\n+      . $self->{_dir}\n+      . "/$hmmlib, with cut off "\n+      . join( " ", @hmmscan_cut_off ) . "\\n";\n+    my @params;\n+    if ( $self->{_cpu} ) {\n+      @params = (\n+        \'hmmscan\', \'--notextw\', \'--cpu\', $self->{_cpu}, @hmmscan_cut_off,\n+        $self->{_dir} . \'/\' . $hmmlib,\n+        $self->{_fasta}\n+      );\n+    }\n+    else {\n+      @params = (\n+        \'hmmscan\', \'--notextw\', @hmmscan_cut_off, $self->{_dir} . \'/\' . $hmmlib,\n+        $self->{_fasta}\n+      );\n+\n+    }\n+\n+    print STDERR "PfamScan::search: hmmscan command: |@params|\\n"\n+      if $ENV{DEBUG};\n+    print STDERR \'PfamScan::search: sequence: |\' . $self->{_sequence} . "|\\n"\n+      if $ENV{DEBUG};\n+\n+    my $run = start \\@params, \'<pipe\', \\*IN, \'>pipe\', \\*OUT, \'2>pipe\', \\*ERR\n+      or croak qq(FATAL: error running hmmscan; IPC::Run returned \'$?\');\n+\n+    # print IN $self->{_sequence}; ;\n+    close IN;\n+\n+    $self->{_hmmresultIO} = Bio::Pfam::HMM::HMMResultsIO->ne'..b'->{_fasta} . 
".translated";\n+\n+  my @params = ( \'translate\', \'-q\', );\n+  if ( $self->{_translate} eq \'all\' ) {\n+    push( @params, \'-a\' );\n+  }\n+  elsif ( $self->{_translate} eq \'orf\' ) {\n+    push( @params, \'-l\', \'20\' );\n+  }\n+  else {\n+    croak qq(Unexpected parameter \'$self->{_translate}\');\n+  }\n+  push( @params, \'-o\', $translatedFasta, $self->{_fasta} );\n+\n+  print STDERR "PfamScan::translate_fasta: translate command: |@params|\\n"\n+    if $ENV{DEBUG};\n+\n+  my $run = start \\@params, \'<pipe\', \\*IN, \'>pipe\', \\*OUT, \'2>pipe\', \\*ERR\n+    or croak qq(FATAL: error running translate; IPC::Run returned \'$?\');\n+\n+  close IN;\n+  close OUT;\n+\n+  my $err;\n+  while (<ERR>) {\n+    $err .= $_;\n+  }\n+  close ERR;\n+\n+  finish $run\n+    or croak qq|FATAL: error running translate ($err); ipc returned \'$?\'|;\n+  open( F, "<", $translatedFasta )\n+    or croak qw(Could not open $translatedFasta \'$!\');\n+  if ( $self->{_translate} eq \'orf\' ) {\n+    while (<F>) {\n+      if (/^>\\s?(\\S+).*nt (\\d+)\\.+(\\d+)/) {\n+        $self->{_orf}->{$1}->{start}  = $2;\n+        $self->{_orf}->{$1}->{end}    = $3;\n+        $self->{_orf}->{$1}->{strand} = ( $2 < $3 ) ? \'+\' : \'-\';\n+      }\n+    }\n+  }\n+  else {\n+    my $currentSeq;\n+    my $currentFrame;\n+    my $currentLen = 0;\n+    my $maxEnd = 0;\n+    while (<F>) {\n+      chomp;\n+      if (/^>\\s?(\\S+\\:)(\\d+)/) {\n+        if ( $currentLen > 0 ) {\n+          my $seqName = $currentSeq . 
$currentFrame;\n+          if ( $currentFrame < 3 ) {\n+            my $start = 1 + $currentFrame;\n+            my $end   = $start + $currentLen - 1;\n+            $self->{_orf}->{$seqName}->{strand} = \'+\';\n+            $self->{_orf}->{$seqName}->{start}  = $start;\n+            $self->{_orf}->{$seqName}->{end}    = $end;\n+            $maxEnd = $end if ( $end > $maxEnd );\n+          }\n+          else {\n+            my $start = $maxEnd - ( $currentFrame - 3 );\n+            my $end = $start - $currentLen + 1;\n+            $self->{_orf}->{$seqName}->{strand} = \'-\';\n+            $self->{_orf}->{$seqName}->{start}  = $start;\n+            $self->{_orf}->{$seqName}->{end}    = $end;\n+          }\n+        }\n+        $currentLen   = 0;\n+        $currentSeq   = $1;\n+        $currentFrame = $2;\n+      }\n+      else {\n+        $currentLen += length($_) * 3;\n+      }\n+    }\n+    my $seqName = $currentSeq . $currentFrame;\n+    if ( $currentFrame < 3 ) {\n+      my $start = 1 + $currentFrame;\n+      my $end   = $start + $currentLen - 1;\n+      $self->{_orf}->{$seqName}->{strand} = \'+\';\n+      $self->{_orf}->{$seqName}->{start}  = $start;\n+      $self->{_orf}->{$seqName}->{end}    = $end;\n+      $maxEnd = $end if ( $end > $maxEnd );\n+    }\n+    else {\n+      my $start = $maxEnd - ( $currentFrame - 3 );\n+      my $end = $start - $currentLen + 1;\n+      $self->{_orf}->{$seqName}->{strand} = \'-\';\n+      $self->{_orf}->{$seqName}->{start}  = $start;\n+      $self->{_orf}->{$seqName}->{end}    = $end;\n+    }\n+  }\n+  $self->{_fasta} = $translatedFasta;\n+}\n+#-------------------------------------------------------------------------------\n+\n+=head1 COPYRIGHT\n+\n+Copyright (c) 2009: Genome Research Ltd.\n+\n+Authors: Jaina Mistry (jm14@sanger.ac.uk), John Tate (jt6@sanger.ac.uk), Rob Finn (finnr@janelia.hhmi.org)\n+\n+This is free software; you can redistribute it and/or\n+modify it under the terms of the GNU General Public License\n+as 
published by the Free Software Foundation; either version 2\n+of the License, or (at your option) any later version.\n+\n+This program is distributed in the hope that it will be useful,\n+but WITHOUT ANY WARRANTY; without even the implied warranty of\n+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n+GNU General Public License for more details.\n+\n+You should have received a copy of the GNU General Public License\n+along with this program; if not, write to the Free Software\n+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.\n+or see the on-line version at http://www.gnu.org/copyleft/gpl.txt\n+ \n+=cut\n+\n+  1;\n+\n'
b
diff -r 000000000000 -r 68a3648c7d91 pfamScan/Bio/Pfam/Scan/Seq.pm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfamScan/Bio/Pfam/Scan/Seq.pm Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,57 @@
+package Bio::Pfam::Scan::Seq;
+
+use strict;
+use warnings;
+
+use Bio::LocatableSeq;
+use Bio::Seq::RichSeq;
+
+use base qw(Bio::LocatableSeq Bio::Seq::RichSeq);
+
+sub new {
+  my ( $class, %params ) = @_;
+
+  # Each field may be supplied under an upper-case or a lower-case key.
+  # Because "||" is used, a false upper-case value (undef, 0, '') falls
+  # through to the lower-case key.
+  my $id    = $params{'-ID'}    || $params{'-id'};
+  my $start = $params{'-START'} || $params{'-start'};
+  my $end   = $params{'-END'}   || $params{'-end'};
+  my $seq   = $params{'-SEQ'}   || $params{'-seq'};
+
+  # Delegate construction to the parent class (see the "use base" line above),
+  # handing over the caller's parameters untouched.
+  my $self = $class->SUPER::new( %params );
+
+  # Set the identifier, coordinates and sequence explicitly through the
+  # accessors rather than relying on the parent constructor to do it.
+  $self->id( $id );
+  $self->start( $start );
+  $self->end( $end );
+  $self->seq( $seq );
+
+  return $self;
+}
+
+=head1 COPYRIGHT
+
+Copyright (c) 2007: Genome Research Ltd.
+
+Authors: Rob Finn (rdf@sanger.ac.uk), John Tate (jt6@sanger.ac.uk)
+
+This is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 2 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program. If not, see <http://www.gnu.org/licenses/>.
+
+=cut
+
+1
b
diff -r 000000000000 -r 68a3648c7d91 pfamScan/htt.fas
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfamScan/htt.fas Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,42 @@
+>gi|296434520|sp|P42858.2|HD_HUMAN RecName: Full=Huntingtin; AltName: Full=Huntington disease protein; Short=HD protein
+MATLEKLMKAFESLKSFQQQQQQQQQQQQQQQQQQQQQPPPPPPPPPPPQLPQPPPQAQPLLPQPQPPPP
+PPPPPPGPAVAEEPLHRPKKELSATKKDRVNHCLTICENIVAQSVRNSPEFQKLLGIAMELFLLCSDDAE
+SDVRMVADECLNKVIKALMDSNLPRLQLELYKEIKKNGAPRSLRAALWRFAELAHLVRPQKCRPYLVNLL
+PCLTRTSKRPEESVQETLAAAVPKIMASFGNFANDNEIKVLLKAFIANLKSSSPTIRRTAAGSAVSICQH
+SRRTQYFYSWLLNVLLGLLVPVEDEHSTLLILGVLLTLRYLVPLLQQQVKDTSLKGSFGVTRKEMEVSPS
+AEQLVQVYELTLHHTQHQDHNVVTGALELLQQLFRTPPPELLQTLTAVGGIGQLTAAKEESGGRSRSGSI
+VELIAGGGSSCSPVLSRKQKGKVLLGEEEALEDDSESRSDVSSSALTASVKDEISGELAASSGVSTPGSA
+GHDIITEQPRSQHTLQADSVDLASCDLTSSATDGDEEDILSHSSSQVSAVPSDPAMDLNDGTQASSPISD
+SSQTTTEGPDSAVTPSDSSEIVLDGTDNQYLGLQIGQPQDEDEEATGILPDEASEAFRNSSMALQQAHLL
+KNMSHCRQPSDSSVDKFVLRDEATEPGDQENKPCRIKGDIGQSTDDDSAPLVHCVRLLSASFLLTGGKNV
+LVPDRDVRVSVKALALSCVGAAVALHPESFFSKLYKVPLDTTEYPEEQYVSDILNYIDHGDPQVRGATAI
+LCGTLICSILSRSRFHVGDWMGTIRTLTGNTFSLADCIPLLRKTLKDESSVTCKLACTAVRNCVMSLCSS
+SYSELGLQLIIDVLTLRNSSYWLVRTELLETLAEIDFRLVSFLEAKAENLHRGAHHYTGLLKLQERVLNN
+VVIHLLGDEDPRVRHVAAASLIRLVPKLFYKCDQGQADPVVAVARDQSSVYLKLLMHETQPPSHFSVSTI
+TRIYRGYNLLPSITDVTMENNLSRVIAAVSHELITSTTRALTFGCCEALCLLSTAFPVCIWSLGWHCGVP
+PLSASDESRKSCTVGMATMILTLLSSAWFPLDLSAHQDALILAGNLLAASAPKSLRSSWASEEEANPAAT
+KQEEVWPALGDRALVPMVEQLFSHLLKVINICAHVLDDVAPGPAIKAALPSLTNPPSLSPIRRKGKEKEP
+GEQASVPLSPKKGSEASAASRQSDTSGPVTTSKSSSLGSFYHLPSYLKLHDVLKATHANYKVTLDLQNST
+EKFGGFLRSALDVLSQILELATLQDIGKCVEEILGYLKSCFSREPMMATVCVQQLLKTLFGTNLASQFDG
+LSSNPSKSQGRAQRLGSSSVRPGLYHYCFMAPYTHFTQALADASLRNMVQAEQENDTSGWFDVLQKVSTQ
+LKTNLTSVTKNRADKNAIHNHIRLFEPLVIKALKQYTTTTCVQLQKQVLDLLAQLVQLRVNYCLLDSDQV
+FIGFVLKQFEYIEVGQFRESEAIIPNIFFFLVLLSYERYHSKQIIGIPKIIQLCDGIMASGRKAVTHAIP
+ALQPIVHDLFVLRGTNKADAGKELETQKEVVVSMLLRLIQYHQVLEMFILVLQQCHKENEDKWKRLSRQI
+ADIILPMLAKQQMHIDSHEALGVLNTLFEILAPSSLRPVDMLLRSMFVTPNTMASVSTVQLWISGILAIL
+RVLISQSTEDIVLSRIQELSFSPYLISCTVINRLRDGDSTSTLEEHSEGKQIKNLPEETFSRFLLQLVGI
+LLEDIVTKQLKVEMSEQQHTFYCQELGTLLMCLIHIFKSGMFRRITAAATRLFRSDGCGGSFYTLDSLNL
+RARSMITTHPALVLLWCQILLLVNHTDYRWWAEVQQTPKRHSLSSTKLLSPQMSGEEEDSDLAAKLGMCN
+REIVRRGALILFCDYVCQNLHDSEHLTWLIVNHIQDLISLSHEPPVQDFISAVHRNSAASGLFIQAIQSR
+CENLSTPTMLKKTLQCLEGIHLSQSGAVLTLYVDRLLCTPFRVLARMVDILACRRVEMLLAANLQSSMAQ
+LPMEELNRIQEYLQSSGLAQRHQRLYSLLDRFRLSTMQDSLSPSPPVSSHPLDGDGHVSLETVSPDKDWY
+VHLVKSQCWTRSDSALLEGAELVNRIPAEDMNAFMMNSEFNLSLLAPCLSLGMSEISGGQKSALFEAARE
+VTLARVSGTVQQLPAVHHVFQPELPAEPAAYWSKLNDLFGDAALYQSLPTLARALAQYLVVVSKLPSHLH
+LPPEKEKDIVKFVVATLEALSWHLIHEQIPLSLDLQAGLDCCCLALQLPGLWSVVSSTEFVTHACSLIYC
+VHFILEAVAVQPGEQLLSPERRTNTPKAISEEEEEVDPNTQNPKYITAACEMVAEMVESLQSVLALGHKR
+NSGVPAFLTPLLRNIIISLARLPLVNSYTRVPPLVWKLGWSPKPGGDFGTAFPEIPVEFLQEKEVFKEFI
+YRINTLGWTSRTQFEETWATLLGVLVTQPLVMEQEESPPEEDTERTQINVLAVQAITSLVLSAMTVPVAG
+NPAVSCLEQQPRNKPLKALDTRFGRKLSIIRGIVEQEIQAMVSKRENIATHHLYQAWDPVPSLSPATTGA
+LISHEKLLLQINPERELGSMSYKLGQVSIHSVWLGNSITPLREEEWDEEEEEEADAPAPSSPPTSPVNSR
+KHRAGVDIHSCSQFLLELYSRWILPSSSARRTPAILISEVVRSLLVVSDLFTERNQFELMYVTLTELRRV
+HPSEDEILAQYLVPATCKAAAVLGMDKAVAEPVSRLLESTLRSSHLPSRVGALHGVLYVLECDLLDDTAK
+QLIPVISDYLLSNLKGIAHCVNIHSQQHVLVMCATAFYLIENYPLDVGPEFSASIIQMCGVML
b
diff -r 000000000000 -r 68a3648c7d91 pfamScan/pfamScan.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfamScan/pfamScan.xml Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,32 @@
+<tool id="pfamScam" name="PFAM Annotator">
+ <description>Annotate PFAM domains. </description>
+ <!-- NOTE(review): the script and HMM directory paths are hard-coded to one
+      installation; confirm they match the target Galaxy server. -->
+ <command> /home/inmare/galaxy/tools/pfamScan/pfam_scan.pl -fasta $input -cpu 2 -dir /home/inmare/galaxy/tools/pfamScan/hmm > $output</command>
+
+ <inputs>
+ <param format="fasta" name="input" type="data" label="Source file"/>
+ </inputs>
+
+ <outputs>
+ <data format="tabular" name="output" />
+ </outputs>
+
+ <tests>
+ <test>
+ <param name="input" value="htt.fas"/>
+ <!-- The output name must match the <data name="output"> declared above. -->
+ <output name="output" file="htt.dom.txt"/>
+ </test>
+ </tests>
+ <citations>
+ <!-- Predicting active site residue annotations in the Pfam database.
+      (2007) BMC bioinformatics 8 :298
+      PMID: 17688688 -->
+ <citation type="doi">10.1186/1471-2105-8-298</citation>
+ <!-- The EMBL-EBI bioinformatics web and programmatic tools framework.
+      (2015 July 01) Nucleic acids research 43 (W1) :W580-4
+      PMID: 25845596 -->
+ <citation type="doi">10.1093/nar/gkv279</citation>
+ </citations>
+ <help>
+ The pfam_scan.pl script is used to annotate PFAM domains on the input file sequences. Only PFAM A class domains are predicted. See http://www.ebi.ac.uk/Tools/pfa/pfamscan/help/ for instructions on how to obtain pfamSCAN and for a brief description of the output format.
+ </help>
+</tool>
b
diff -r 000000000000 -r 68a3648c7d91 pfamScan/pfam_scan.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfamScan/pfam_scan.pl Thu Dec 22 04:45:31 2016 -0500
[
b'@@ -0,0 +1,338 @@\n+#!/usr/bin/env perl\n+\n+# $Id: pfam_scan.pl 9045 2015-05-26 09:09:52Z rdf $\n+\n+use strict;\n+use warnings;\n+\n+BEGIN {push @INC,"/home/inmare/galaxy/tools/pfamScan"}\n+use Bio::Pfam::Scan::PfamScan;\n+use Getopt::Long;\n+\n+my $VERSION = "1.5"; \n+\n+#-------------------------------------------------------------------------------\n+\n+# get the user options\n+my ( $outfile, $e_seq, $e_dom, $b_seq, $b_dom, $dir, \n+     $clan_overlap, $fasta, $align, $help, $as, $pfamB, \n+     $json, $only_pfamB, $cpu, $translate );\n+GetOptions( \'help\'         => \\$help,\n+            \'outfile=s\'    => \\$outfile,\n+            \'e_seq=f\'      => \\$e_seq,\n+            \'e_dom=f\'      => \\$e_dom,\n+            \'b_seq=f\'      => \\$b_seq,\n+            \'b_dom=f\'      => \\$b_dom,\n+            \'dir=s\'        => \\$dir,\n+            \'clan_overlap\' => \\$clan_overlap,\n+            \'fasta=s\'      => \\$fasta,\n+            \'align\'        => \\$align,\n+            \'h\'            => \\$help,\n+            \'as\'           => \\$as,\n+            \'pfamB\'        => \\$pfamB,\n+            \'only_pfamB\'   => \\$only_pfamB,\n+            \'json:s\'       => \\$json,\n+            \'cpu=i\'        => \\$cpu,\n+            \'translate:s\'  => \\$translate\n+);\n+\n+help() if $help;\n+help() unless ( $dir and $fasta ); # required options\n+\n+my $pfamA;\n+if ( $only_pfamB or $pfamB ) {\n+  die qq(FATAL: As of release 28.0, Pfam no longer produces Pfam-B. 
The -pfamB and -only_pfamB options are now obsolete.\\n);\n+  $pfamB=1;\n+}\n+else {\n+  $pfamA=1;\n+}\n+\n+my @hmmlib;\n+push @hmmlib, \'Pfam-A.hmm\' if $pfamA;\n+push @hmmlib, \'Pfam-B.hmm\' if $pfamB;\n+\n+#-------------------------------------------------------------------------------\n+\n+# check the input parameters\n+\n+die qq(FATAL: must specify both "-dir" and "-fasta")\n+  unless ( defined $dir and defined $fasta );\n+\n+die qq(FATAL: can\'t find directory "$dir")\n+  unless -d $dir;\n+\n+die qq(FATAL: can\'t find file "$fasta")\n+  unless -s $fasta;\n+\n+foreach my $hmmlib ( @hmmlib ) {\n+  die qq(FATAL: can\'t find "$hmmlib" and/or "$hmmlib" binaries and/or "$hmmlib.dat" file in "$dir")\n+    unless ( -s "$dir/$hmmlib"     and \n+             -s "$dir/$hmmlib.h3f" and\n+             -s "$dir/$hmmlib.h3i" and\n+             -s "$dir/$hmmlib.h3m" and\n+             -s "$dir/$hmmlib.h3p" and\n+             -s "$dir/$hmmlib.dat" );\n+}\n+\n+die qq(FATAL: can\'t use E-value or bit score threshold with Pfam-B searches; Pfam-B searches use a default cut_off of 0.001)\n+  if ( ( $e_seq or $e_dom or $b_seq or $b_dom ) and not $pfamA ); \n+\n+die qq(FATAL: can\'t use E-value and bit score threshold together)\n+  if ( ( $e_seq and ( $b_seq or $b_dom ) ) or \n+       ( $b_seq and ( $e_seq or $e_dom ) ) or \n+       ( $b_dom and $e_dom ) );\n+\n+die qq(FATAL: output file "$outfile" already exists)\n+  if ( $outfile and -s $outfile );\n+\n+if ( $as ) {\n+  die qq(FATAL: "-as" option only works on Pfam-A families)\n+    unless $pfamA;\n+\n+  die qq(FATAL: can\'t find "active_site.dat" in "$dir")\n+    unless -s "$dir/active_site.dat";\n+}\n+\n+if ( defined $translate ) {\n+  if ( $translate eq "" ) {\n+    # no argument to "-translate" was given, so make "orf" the default\n+    $translate = \'orf\';\n+  }\n+  else {\n+    # there was an argument to "-translate", so make sure it\'s valid\n+    unless ( $translate eq "all" or $translate eq "orf" ) {\n+      die 
qq(FATAL: "-translate" option accepts only "all" and "orf");\n+    }\n+  }\n+}\n+\n+#-------------------------------------------------------------------------------\n+\n+# build the object\n+my $ps = Bio::Pfam::Scan::PfamScan->new(\n+  -e_seq        => $e_seq,\n+  -e_dom        => $e_dom,\n+  -b_seq        => $b_seq,\n+  -b_dom        => $b_dom,\n+  -dir          => $dir,\n+  -clan_overlap => $clan_overlap,\n+  -fasta        => $fasta,\n+  -align        => $align,\n+  -as           => $as,\n+  -hmmlib       => \\@hmmlib,\n+  -version      => $VERSION,\n+  -cpu          => $cpu,\n+  -translate    => $translate\n+);\n+\n+# run the search\n+$ps->search;\n+\n+# print the results\n+if ( defined $json'..b'ff [default: use Pfam GA cutoff]\n+\n+=item B<-e_dom> \n+\n+Domain E-value cut-off [default: use Pfam GA cutoff]\n+\n+=item B<-b_seq>\n+\n+Sequence bits score cut-off [default: use Pfam GA cutoff]\n+\n+=item B<-b_dom>\n+\n+Domain bits score cut-off [default: use Pfam GA cutoff]\n+\n+=item B<-clan_overlap>\n+\n+Allow sequences in different clans to overlap [default: false]\n+\n+=item B<-align>\n+\n+Show alignment snippets in results [default: false]\n+\n+=item B<-as>\n+\n+Search for active sites on Pfam-A matches [default: false]\n+\n+=item B<-json> [I<pretty>]\n+\n+Write the results in JSON format [default: false]\n+\n+=item B<-cpu>\n+\n+Number of parallel CPU workers to use for multithreads [default: all]\n+\n+=item B<-translate> [I<mode>]\n+\n+Treat the input sequence as DNA and perform a six-frame translation before\n+searching, using the "translate" program from the HMMER v2.3.2 package. If the\n+optional value I<mode> is given, it must be either "all" or "orf": "all" means\n+translate in full, with stops, and produce no individual ORFs; "orf" means\n+translate and report only ORFs of length greater than 20. 
If B<translate> is\n+used but I<mode> is omitted, the default is to translate using the "orf"\n+method [default: off (no translation)]\n+\n+=item B<-h>\n+\n+Display help message\n+\n+=back\n+\n+The input must be a FASTA-format file. The C<-fasta> and C<-dir> options are \n+mandatory. You cannot specify both an E-value and bits score threshold.  \n+\n+=head1 OVERVIEW\n+\n+C<pfam_scan.pl> is a script for searching one or more protein sequences against the\n+library of HMMs from Pfam. It requires a local copy of the Pfam data files, which \n+can be obtained from the Pfam FTP area:\n+\n+  ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/\n+\n+You must also have the HMMER3 binaries installed and their locations given by your\n+C<PATH> environment variable. You can download the HMMER3 package at:\n+\n+  ftp://selab.janelia.org/pub/software/hmmer3/\n+\n+=head1 OUTPUT\n+\n+The output format is:\n+<seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan> <predicted_active_site_residues>\n+Example output (-as option):\n+\n+  O65039.1     38     93     38     93 PF08246   Inhibitor_I29     Domain     1    58    58     45.9   2.8e-12   1 No_clan\n+  O65039.1    126    342    126    342 PF00112   Peptidase_C1      Domain     1   216   216    296.0   1.1e-88   1 CL0125   predicted_active_site[150,285,307]\n+\n+Most of these values are derived from the output of I<hmmscan> (see HMMER3\n+documentation for details). The significance value is 1 if the bit score for a\n+hit is greater than or equal to the curated gathering threshold for the\n+matching family, 0 otherwise. \n+\n+=head1 REFERENCES\n+\n+Active site residues are predicted using the method described in the publication: \n+\n+Mistry J., Bateman A., Finn R.D. "Predicting active site residue annotations in \n+the Pfam database." BMC Bioinformatics. 2007;8:298. 
PMID:17688688.\n+\n+=head1 AUTHORS\n+\n+Jaina Mistry (jaina@ebi.ac.uk), Rob Finn (rdf@ebi.ac.uk)\n+\n+=cut\n+\n+=head1 COPYRIGHT\n+\n+Copyright (c) 2009: Genome Research Ltd.\n+\n+Authors: Jaina Mistry (jaina@ebi.ac.uk), rdf (rdf@ebi.ac.uk)\n+\n+This is free software; you can redistribute it and/or\n+modify it under the terms of the GNU General Public License\n+as published by the Free Software Foundation; either version 2\n+of the License, or (at your option) any later version.\n+\n+This program is distributed in the hope that it will be useful,\n+but WITHOUT ANY WARRANTY; without even the implied warranty of\n+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n+GNU General Public License for more details.\n+\n+You should have received a copy of the GNU General Public License\n+along with this program; if not, write to the Free Software\n+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.\n+or see the on-line version at http://www.gnu.org/copyleft/gpl.txt\n+ \n+=cut\n+\n'
b
diff -r 000000000000 -r 68a3648c7d91 pfam_annot/annota.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_annot/annota.pl Thu Dec 22 04:45:31 2016 -0500
[
@@ -0,0 +1,221 @@
+#!/usr/bin/perl -w
+
+use strict;
+# Tab-separated Pfam-A family table used to turn a Pfam accession into a
+# human-readable description.
+my $d_file="/home/inmare/galaxy/tools/pfam_annot/pfamA.txt";
+open(IN,'<',$d_file) or die "Cannot open Pfam-A table '$d_file': $!\n";
+my %decode=();      # Pfam accession (2nd column) -> HTML description text
+my %clan_decode;    # clan accession -> description text (filled further down)
+my $id="";          # identifier of the FASTA record currently being read
+my %c=();           # protein identifier -> sequence (filled further down)
+
+
+# Command line: protein FASTA, pfam_scan result table, output HTML file.
+my $prot_file=shift;
+my $pfam_file=shift;
+my $prefix=shift;
+die "Usage: $0 <proteins.fasta> <pfam_scan.output> <output.html>\n"
+    unless defined $prot_file && defined $pfam_file && defined $prefix;
+
+while(my $line=<IN>)
+{
+    # Only rows starting with a digit are data rows.
+    next unless $line=~/^\d/;
+
+    # NOTE: runs of tabs are collapsed, so empty columns shift the indices.
+    my @vl=split(/\t+/,$line);
+
+    # The description starts with the 4th column followed by a line break.
+    $decode{$vl[1]}="$vl[3]<br>";
+
+    my $cc=0;           # number of extra text columns appended so far
+    my %repeated=();    # columns already appended for this family
+    foreach my $v (@vl)
+    {
+        # Drop literature reference markers such as "[1]".
+        $v=~s/\[\d+\]/ /g;
+
+        # Everything from the hmmbuild command line onwards is not descriptive.
+        last if $v=~/hmmbuild/;
+        last if $cc>10;
+
+        # Skip author / curator columns.
+        next if $v=~/anon|Bates|Cogis|Coggis|Bateman|Sonnhammer|Finn|Studholme|Kerrison/;
+        next if $repeated{$v};
+        next if $v eq $vl[3];
+
+        # Keep only reasonably long free-text columns, ten at most.
+        next unless length($v)>=20 && $cc<=9;
+
+        $decode{$vl[1]}.="$v ";
+        $repeated{$v}++;
+        $cc++;
+    }
+}
+close(IN);
+
+# Tab-separated clan table; the 2nd column is the clan accession used as key.
+my $clan_file="/home/inmare/galaxy/tools/pfam_annot/clans.txt";
+open(IN,'<',$clan_file) or die "Cannot open clan table '$clan_file': $!\n";
+while(my $line=<IN>)
+{
+    my @vl=split(/\t/,$line);
+
+    # Rows without a second column cannot be keyed by accession.
+    next unless defined $vl[1];
+
+    my $cc=0;    # 1-based index of the column being inspected
+    foreach my $v (@vl)
+    {
+        $cc++;
+
+        # Drop literature reference markers such as "[1]".
+        $v=~s/\[\d+\]/ /g;
+
+        # Only long free-text columns among the first ten are kept.
+        $clan_decode{$vl[1]}.="$v " if length($v)>=30 && $cc<=10;
+    }
+}
+close(IN);
+# Read the protein FASTA file: sequences go to %c, and for identifiers of
+# the form "<prefix>#<rest>" the prefix is counted in %plasm.
+my %plasm=();    # plasmid prefix -> number of proteins carrying it
+open(IN,'<',$prot_file) or die "Cannot open protein FASTA '$prot_file': $!\n";
+while(my $line=<IN>)
+{
+    if ($line=~/^>(.*)/)
+    {
+        # The identifier is the first whitespace-delimited token of the header.
+        $id=(split(/\s+/,$1))[0];
+        $id="" unless defined $id;    # bare ">" header line
+
+        if ($id=~/#/)
+        {
+            my $pid=(split(/\#/,$id))[0];
+            $plasm{$pid}++ if defined $pid;
+        }
+    }else{
+        chomp($line);
+        $c{$id}.=$line;
+    }
+}
+close(IN);
+
+# Open the report and write the HTML head (style sheet plus the show()
+# helper that displays one "page" div and hides the others).
+open(OUT,'>',$prefix) or die "Cannot write HTML report '$prefix': $!\n";
+print OUT "<html>\n<head>\n";
+print OUT "<style type=\"text/css\">\nspan {\n\ttext-decoration:underline;\n\tcolor:blue;\n\tcursor:pointer;\n}\n</style>\n";
+print OUT "<script>\nfunction show(elementID) {\n\tvar ele = document.getElementById(elementID);\n\tif (!ele) {\n\t\talert(\"no such element\");\t\treturn;\n\t}\n\tvar pages = document.getElementsByClassName('page');\n\tfor(var i = 0; i < pages.length; i++) {\n\t\tpages[i].style.display = 'none';\n\t}\n\tele.style.display = 'block';\n}\n</script>\n";
+# The closing </body> tag is written at the end of the report, so the body
+# has to be opened here.
+print OUT "</head>\n<body>\n";
+my $color="\"#czb9dz\"";    # background used for highlighted table cells
+my %printed;                # protein name -> 1 once its header was written
+open(IN,'<',$pfam_file) or die "Cannot open pfam_scan table '$pfam_file': $!\n";
+print OUT "Proteins with PFAM domains:\n<br><br>\n";
+my @dd=keys %plasm;
+if ($#dd>0)
+{
+    # Several plasmids: offer one clickable link per plasmid section.
+    my @links=map { "<span onclick=\"show(\'$_\');\">$_</span>" } @dd;
+    print OUT "<p>Show results ", join(",\n",@links), ".</p>\n";
+}
+elsif (!@dd)
+{
+    # No plasmid prefixes at all: every row goes into one anonymous section.
+    # With exactly one plasmid nothing is opened here, because the rendering
+    # loop opens the section for that plasmid itself.
+    print OUT "<div>\n<table cellpadding=\"0\" width=650>\n";
+}
+# Walk through the pfam_scan table and write, per protein, its sequence
+# followed by one cell per Pfam family / clan description.
+my $ntokens=0;         # cell counter used to alternate cell backgrounds
+my $prev_plasmid="";
+my $curr_plasmid="";
+my $np=0;              # number of per-plasmid sections opened so far
+while(my $line=<IN>)
+{
+    next if $line=~/^\#/;
+
+    # Columns used: 1st = sequence id, 6th = Pfam accession, last = clan.
+    my ($name,$domain,$clan)=(split(/\s+/,$line))[0,5,-1];
+    next unless $name && defined $domain;
+
+    if ($name=~/#/)
+    {
+        $curr_plasmid=(split(/\#/,$name))[0];
+        if ($curr_plasmid ne $prev_plasmid)
+        {
+            # Close the previous plasmid section before opening a new one.
+            if ($np>0)
+            {
+                print OUT "</table>\n";
+                print OUT "</div>\n";
+            }
+            print OUT "<div id=\"$curr_plasmid\" class=\"page\" style=\"\">\n";
+            print OUT "<table cellpadding=\"0\" width=650>\n";
+            $np++;
+        }
+        $prev_plasmid=$curr_plasmid;
+    }
+
+    # Strip the version suffix from the accession (PF00001.12 -> PF00001).
+    $domain=~s/\.\d+//;
+
+    unless ($printed{$name})
+    {
+        # Mark the protein right away so that no later branch can cause the
+        # header to be written a second time.
+        $printed{$name}=1;
+
+        # A protein missing from the FASTA file is shown with an empty sequence.
+        my $seq=defined $c{$name} ? $c{$name} : "";
+        $seq=~s/\*//g;
+        $seq=form($seq,90);
+
+        print OUT "<td>\n";
+        print OUT "<HR SIZE=3 WIDTH=80%></HR>\n";
+        print OUT "<center><b>$name</b><br>\n</center>\n";
+        print OUT "</td>\n<tr></tr>\n";
+        print OUT "<td bgcolor=$color>\n";
+        print OUT "<pre> \n$seq\n </pre>\n";
+        print OUT "</td>\n<tr></tr>\n<td></td>\n<tr></tr>\n";
+        $ntokens=2;
+    }
+
+    my $hd=uc $domain;
+    if ($decode{$domain})
+    {
+        my $ddes=$decode{$domain};
+        $ddes=~s/\s+/ /g;
+
+        my $cell_open=($ntokens % 2==0) ? "<td>\n" : "<td bgcolor=$color>\n";
+        print OUT $cell_open;
+        print OUT "<p align=\"left\">\n";
+        print OUT "<a href=http://pfam.xfam.org/family/$hd> $domain</a>\n<p align=\"justify\">$ddes</p>\n\n";
+        print OUT "</p>\n</td>\n<tr></tr>\n<td></td>\n<tr></tr>\n";
+        $ntokens++;
+    }
+
+    # The clan cell is skipped when its text merely repeats the family text.
+    my $same_text=defined $decode{$domain}
+        && defined $clan_decode{$clan}
+        && $decode{$domain} eq $clan_decode{$clan};
+    if ($clan_decode{$clan} && !$same_text)
+    {
+        my $clanD=$clan_decode{$clan};
+        $clanD=~s/\s+/ /g;
+
+        my $cell_open=($ntokens % 2==0) ? "<td>\n" : "<td bgcolor=$color>\n";
+        print OUT $cell_open;
+        print OUT "<p align=\"left\">\n";
+        print OUT "<a href=http://pfam.xfam.org/clan/$clan> $clan</a>\n <p align=\"justify\">$clanD</p>\n\n";
+        print OUT "</p>\n</td>\n<tr></tr>\n<td></td>\n<tr></tr>\n";
+        $ntokens++;
+    }
+}
+close(IN);
+
+# Close the last table / section and finish the document.
+print OUT "</table>\n";
+print OUT "</div>\n";
+print OUT "</body>\n";
+print OUT "</html>\n";
+
+# Buffered write errors only surface when the handle is closed.
+close(OUT) or die "Cannot finish writing HTML report '$prefix': $!\n";
+
+
+# form($string, $len)
+#
+# Wrap $string into lines of at most $len characters and return the result;
+# every line, including the last one, ends with "\n".
+# An empty or undefined $string yields "". When $len is missing or not
+# positive the string is returned as a single line instead of looping forever.
+sub form
+{
+    my ($string,$len)=@_;
+    return "" unless defined $string && length($string);
+    return $string."\n" unless defined $len && $len>0;
+
+    my $outS="";
+    # Strictly "<": stopping before length($string) avoids the extra empty
+    # line that appeared when the length was an exact multiple of $len.
+    for (my $i=0;$i<length($string);$i+=$len)
+    {
+        $outS.=substr($string,$i,$len)."\n";
+    }
+    return $outS;
+}
b
diff -r 000000000000 -r 68a3648c7d91 pfam_annot/clans.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_annot/clans.txt Thu Dec 22 04:45:31 2016 -0500
[
b"@@ -0,0 +1,515 @@\n+1\tCL0001\tEGF\t\\N\tEGF superfamily\tFinn RD, Bateman A\tanon\tMembers of this clan all belong to the EGF superfamily.  This particular superfamily is characterised as having least 6 cysteines residues.\\\t\\\t\\\t\\\t   These cysteine form disulphide bonds, in the order 1-3, 2-4, 5-6, which are essential for the stability of  the EGF fold.    These disulphide bonds are stacked in a ladder-like arrangement.  The Laminin EGF family is distinguished by having an an additional disulphide bond.  The function of the domains within  this family remains unclear, but they are though to largely  perform a structural role.  More often than not, there domains are arranged a tandem repeats in extracellular proteins.\t2008-09-03 15:50:29\t2004-03-17 16:02:08\t26\t325\t6259\t696\t88541\t1\n+3\tCL0003\tSAM\t\\N\tSterile Alpha Motif (SAM) domain\tFinn RD\tanon\tSAM domains are found in a diverse set of proteins, which include  scaffolding proteins, transcription regulators, translational  regulators tyrosine kinases and serine/threonine kinases [1-3].   SAM domains are found in all eukaryotes and some bacteria [3] .   Structures of SAM domains reveal a common five helical structure.  The SAM domain is involved in a variety of functions. The most widespread function is in domain-domain interactions.   The SAM domain performs domain-domain interactions using multifarious  arrangements of the SAM domain.  More recently, the SAM domain within  the Smaug protein has been demonstrated to bind to the Nanos 3' UTR translation control element (Rfam:RF00161) [3]. 
This clan currently only represents the diverse SAM domain family and does not contain the more divergent SAM/Pointed family (Pfam:PF02198).\t2008-09-03 15:50:29\t2004-03-17 16:21:50\t20\t126\t742\t467\t11010\t1\n+4\tCL0004\tConcanavalin\t\\N\tConcanavalin-like lectin/glucanase superfamily\tBateman A\tanon\tThis superfamily includes a diverse range of carbohydrate binding domains and glycosyl hydrolase enzymes that share a common structure.\t2008-09-03 15:50:29\t2004-03-17 16:44:11\t19\t1631\t2750\t3131\t34755\t1\n+5\tCL0005\tKazal\t\\N\tKazal like domain\tFinn RD\tanon\tKazal domains are found in both serine protease inhibitors and extracellular regions of agrins. The structure of the Kazal domain is a small alpha/beta fold. Typically the Kazal domain consists of 2 short-helices and a  3-stranded anti-parallel sheet.  The fold is contains several disulphide bonds. \t2008-09-03 15:50:29\t2004-03-17 17:00:11\t26\t106\t337\t450\t6552\t1\n+6\tCL0006\tC1\t\\N\tProtein kinase C, C1 domain\tFinn RD\tanon\tThe members of this clan are all variations of the protein kinase C1 domain that is characterised by a rich cysteine and histidine content.  The C1 domain is the N-terminal region of conservation found in protein kinase C domains.  This domain is involved in binding many ligands, which include diacylglycerol, phorbol  esters and zinc [1].\t2008-09-03 15:50:29\t2004-03-17 17:47:56\t19\t30\t728\t396\t10495\t1\n+7\tCL0007\tKH\t\\N\tK-Homology (KH) domain Superfamily\tFinn RD\tanon\tThe KH domain is thought to be the second most prevalent RNA  binding motif in proteins.  The motif is characterised by a  conserved GXXXGXXG in the middle of the domain.  Structures of KH reveal that the KH domain is arranged as either a  beta-alpha-alpha-beta-beta (mini-KH domain) or  beta-alpha-alpha-beta-beta-alpha (maxi-KH domain).  The secondary elements are separated by at least four loop segments. 
The second loop is located between beta-1 and al  The KH domain can be found either as single or multiple copies.  The KH domain usually binds RNA as a multimer.\t2008-09-03 15:50:29\t2004-03-17 17:58:30\t17\t312\t491\t5344\t38636\t1\n+9\tCL0009\tENTH_VHS\t\\N\tENTH/ANTH/VHS superfamily\tBateman A, McMahon H\tanon\tThis clan includes the related ENTH and ANTH domains as well as the VHS domain. The ENTH domain is approximately 150 residues in length and is a solenoid of alpha-helices. The various ENTH domains have various lipid specificities but the key feature that distinguishes it functionally from ANTH domains is it"..b'phorylated tyrosine residues on its interacting protein-partner.\t2012-02-21 16:19:05\t2012-02-21 16:19:05\t1\t387\t421\t453\t9393\t1\n+542\tCL0542\tRAS_GEF_N\t\tRas guanyl-nucleotide exchange factor activity N-term\tCoggill P\tpcc\tThis is the more N-terminal domain of the RAS-GEF superfamily.\t2012-02-29 11:57:43\t2012-02-29 11:57:43\t1\t14\t157\t277\t2682\t1\n+543\tCL0543\tViral_gly_cn_dm\t\tViral glycoprotein central and dimerisation domains\tEberhardt R\tre3\tFlavivirus and alpha virus glycoprotein E/E1 consists of three domains. A dimerisation domain, a central domain and an immunoglobulin-like domain. The dimerisation and central domains are intertwined [1-3].\t2012-03-23 15:37:00\t2012-03-23 15:37:00\t1\t93\t37\t200\t12815\t1\n+544\tCL0544\tAcylCoA_ox_dh_N\t\tAcyl-coenzyme A oxidase/dehydrogenase N-terminal\tEberhardt R\tre3\tAcyl-CoA dehydrogenases and acyl-coenzyme A oxidases consist of three domains. An N-terminal all alpha domain, a beta-barrel middle domain and a C-terminal catalytic domain [1-2].\t2012-03-27 08:17:33\t2012-03-27 09:17:33\t1\t223\t112\t3478\t26792\t1\n+545\tCL0545\tAPCOP-app_sub\t\tClathrin (AP) and COPI appendage platform  subdomain\tCoggill P\tpcc\tThis superfamily is characterised by subdomains from the clathrin and coatomer appendages. 
The superfamily possesses a single protein/protein interaction site that in yeast binds to the ARFGAP Glo3p, and in mammalian gamma-COP binds to a Glo3p orthologue, ARFGAP2 [1].\t2012-04-18 13:17:34\t2012-04-18 14:17:34\t1\t0\t11\t303\t446\t1\n+546\tCL0546\tHexosaminidase\t\tbeta-N-acetylhexosaminidase-like domain\tCoggill P\tpcc\tThis superfamily is characterised by a mixed beta sheet with connection over the free side of the sheet. The fold is like a zincin fold lacking the catalytic centre.\t2012-04-30 14:53:24\t2012-04-30 15:53:24\t1\t124\t150\t1212\t3278\t1\n+547\tCL0547\tGF_recep_C-rich\t\tGrowth factor receptor Cys-rich\tEberhardt R\tre3\tThe cysteine-rich regions of growth factor receptor tyrosine kinases consist of eight disulphide-linked modules [1].\t2012-05-01 09:19:56\t2012-05-01 10:19:56\t1\t49\t88\t140\t2125\t1\n+548\tCL0548\tIHF-likeDNA-bdg\t\tIHF-like DNA-binding protein supewrfamily\tCoggill P\tpcc\tThis superfamily is characterised by being a dimer of identical subunits of a core of four helices in a bundle, partly opened, capped with a beta-sheet. All members appear to be prokaryotic DNA-binding domains.\t2012-05-01 16:00:04\t2012-05-01 17:00:04\t1\t60\t13\t4459\t12213\t1\n+549\tCL0549\tNicO_HupE_DsbD\t\tNicO/HupE/DsdD superfamily\tEberhardt R\tre3\tThis clan contains the nickel transpot family NicO [1-2] and the HupE/UreJ proteins, which may be involved in nickel binding. 
NicO and HupE contain a conserved GxxHxxxDH motif, which may bind to nickel.\t2012-05-31 10:50:16\t2012-05-31 11:50:16\t1\t0\t55\t3953\t10451\t1\n+550\tCL0550\tSRCR\t\tSRCR-like\tEberhardt R\tre3\t\t2012-07-11 13:20:22\t2012-07-11 14:20:22\t1\t19\t402\t139\t9451\t1\n+551\tCL0551\tBCLiA\t\tBcl-2 inhibitors of programmed cell death\tCoggill P\tpcc\tThis superfamily is characterised by families of proteins that inhibit apoptosis, They are regulated by all BH3-only proteins to promote apoptosis.\t2012-08-31 14:55:14\t2012-08-31 15:55:14\t1\t151\t28\t348\t1402\t1\n+552\tCL0552\tHect\t\tHect, E3 ligase catalytic domain\tCoggill P\tpcc\tThis superfamily is characterised by fmailies with E3-ligase catalytic acitvity.  The fold consists of two alpha+beta domains; where the N-terminal domain is an array of helices and beta-hairpins; the C-terminal domain is an a/b sandwich with one left-handed beta-alpha(n)-beta unit.\t2012-09-07 15:42:50\t2012-09-07 16:42:50\t1\t23\t249\t342\t4684\t1\n+553\tCL0553\tHBMR\t\tHelical backbone metal receptor superfamily\tCoggill P\tpcc\tThis superfamily is characterised by a long alpha helical insertion in the interdomain linker in the Chelatase-like fold. Representative families include the periplasmic ferric siderophore binding proteins, the TroA-like nitrogenase iron-molybdenum proteins and the putative iron(III) transporter family, TM0189-like, or periplasmic-binding family.\t2012-09-12 10:08:39\t2012-09-12 11:08:39\t1\t190\t51\t4828\t19772\t1\n'
b
diff -r 000000000000 -r 68a3648c7d91 pfam_annot/f1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_annot/f1 Thu Dec 22 04:45:31 2016 -0500
[
b"@@ -0,0 +1,14831 @@\n+PF00389 2-Hacid_DH; <br>D-isomer specific 2-hydroxyacid dehydrogenase, catalytic domain. This family represents the largest portion of the catalytic domain  of 2-hydroxyacid dehydrogenases as the NAD binding domain is  inserted within the structural domain.. \n+PF00198 2-oxoacid dehydrogenases acyltransferase (catalytic domain)<br>These proteins contain one to three copies of a lipoyl binding domain followed by the catalytic domain.. \n+PF04029 2-phosphosulpholactate phosphatase<br>Thought to catalyse 2-phosphosulpholactate = sulpholactate + phosphate. Probable magnesium cofactor.  Involved in the second step of coenzyme M biosynthesis.  Inhibited by vanadate in Methanococcus jannaschii.  Also known as the ComB family  .. \n+PF03171 2OG-Fe(II) oxygenase superfamily<br>This family contains members of the 2-oxoglutarate (2OG) and Fe(II)-dependent oxygenase superfamily  .  This family includes the C-terminal of prolyl 4-hydroxylase  alpha subunit.  The holoenzyme has the activity EC:1.14.11.2  catalysing the reaction: Procollagen L-proline + 2-oxoglutarate + O2 <=> procollagen trans- 4-hydroxy-L-proline + succinate + CO2.  The full enzyme consists of a alpha2 beta2 complex with the alpha subunit contributing most of the parts of the active site  . The family also includes lysyl hydrolases, isopenicillin synthases and AlkB.. \n+PF01073 3-beta hydroxysteroid dehydrogenase/isomerase family<br>Pfam-B_504 (release 3.0). The enzyme 3 beta-hydroxysteroid dehydrogenase/5-ene-4-ene  isomerase (3 beta-HSD) catalyses the oxidation and isomerisation  of 5-ene-3 beta-hydroxypregnene and 5-ene-hydroxyandrostene  steroid precursors into the corresponding 4-ene-ketosteroids necessary for the formation of all classes of steroid hormones.. \n+PF04419 4F5 protein family<br>Members of this family are short proteins that are rich in aspartate, glutamate, lysine and arginine. 
Although the function of these proteins is unknown, they are found to be ubiquitously expressed  .. \n+PF03061 Thioesterase superfamily<br>Pfam-B_2758 (release 6.4). This family contains a wide variety of enzymes, principally thioesterases. This family includes 4HBT (EC 3.1.2.23) which catalyses the final step in the  biosynthesis of 4-hydroxybenzoate from 4-chlorobenzoate in the soil dwelling microbe Pseudomonas CBS-3. This family includes various cytosolic long-chain acyl-CoA  thioester hydrolases. Long-chain acyl-CoA hydrolases hydrolyse palmitoyl-CoA to CoA and palmitate, they also catalyse the hydrolysis of other  long chain fatty acyl-CoA thioesters.. \n+PF02872 5_nucleotidaseC; <br>5'-nucleotidase, C-terminal domain. Pfam-B_1318 (release 3.0). \n+PF00003 7 transmembrane sweet-taste receptor of 3 GCPR<br>This is a domain of seven transmembrane regions that forms the C-terminus of some subclass 3 G-coupled-protein receptors. It is often associated with a downstream cysteine-rich linker domain, NCD3G Pfam:PF07562, which is the human sweet-taste receptor, and the N-terminal domain, ANF_receptor Pfam:PF01094. The seven TM regions assemble in such a way as to produce a docking pocket into which such molecules as cyclamate and lactisole have been found to bind and consequently confer the taste of sweetness  .. \n+PF01661 DUF27;A1pp; <br>Pfam-B_434 (release 4.1). This domain is an ADP-ribose binding module.  It is found in a number of otherwise unrelated proteins.  It is found at the C-terminus of the macro-H2A histone protein Swiss:Q02874. This domain is found in the non-structural proteins of several types of ssRNA viruses such as NSP3 from alphaviruses Swiss:P03317. This domain is also found on its own in a family of proteins from bacteria Swiss:P75918, archaebacteria Swiss:O59182 and eukaryotes Swiss:Q17432.. \n+PF02177 A4_EXTRA;<br>Amyloid A4 N-terminal heparin-binding. Alignment kindly provided by SMART. 
This N-terminal domain of APP, amyloid precursor protein, is the heparin-binding domain of the protein. this region is also responsible for stimulation of neurit"..b' type 2, type 7 or PrsW-peptidase dependent secretion system  .. \n+PF15647 Restriction endonuclease fold toxin 3<br>A predicted toxin of the restriction endonuclease fold present in bacterial polymorphic toxin systems. In bacterial polymorphic toxin systems, the toxin is exported by the type 2, type 6, type 7 or PrsW-peptidase dependent secretion system  .. \n+PF15648 Restriction endonuclease fold toxin 5<br>A predicted toxin of the restriction endonuclease fold present in bacterial polymorphic toxin systems. In bacterial polymorphic toxin systems, the toxin is exported by the type 2, type 5, type 6, or PrsW-peptidase dependent secretion system. Versions of this domain are also found in caudoviruses  .. \n+PF15649 Restriction endonuclease fold toxin 7<br>A predicted toxin of the restriction endonuclease fold present in bacterial polymorphic toxin systems. In bacterial polymorphic toxin systems, the toxin is exported by the type 2, type 5, type 6, or type 7 secretion system  .. \n+PF15650 Restriction endonuclease fold toxin 9<br>A predicted toxin of the restriction endonuclease fold present in bacterial polymorphic toxin systems. In bacterial polymorphic toxin systems, the toxin is exported by the type 2 or type 7 secretion system  .. \n+PF15651 Salivary glad secreted protein domain toxin<br>An alpha+beta fold domain with four conserved cysteine residues and a conserved [DE}xx[ND] motif. This domain is mainly present at the c-terminus of RHS repeats containing proteins in insects and crustaceans. Although no bacterial homologs have been identified, the domain architecture suggests an origin from bacterial polymorphic toxin systems  . . 
\n+PF15652 HNH/Endo VII superfamily toxin with a SHH signature<br>A predicted toxin of the HNH/Endonuclease VII fold present in bacterial polymorphic toxin systems with two conserved histidine residues. In bacterial polymorphic toxin systems, the toxin is exported by the type 2, type 5, type 6 or type 7 secretion system  .. \n+PF15653 URI fold toxin 2<br>A predicted toxin of the URI nuclease fold present in bacterial polymorphic toxin systems. In bacterial polymorphic toxin systems, the toxin is exported by the type 2 or type 6 secretion system  .. \n+PF15654 Toxin with a conserved tryptophan and TIP tripeptide motif<br>A predicted toxin domain with two membrane spanning alpha helices and RxxR, Wx[ST]IP motifs. The domain is present in bacterial polymorphic toxin systems. The toxin is usually exported by the type 2 or Photorhabdus virulence cassette (PVC)-type secretion system  .. \n+PF15655 NTF2 fold immunity protein<br>A predicted immunity protein of the NTF2 fold. Proteins containing this domain are present in bacterial polymorphic toxin systems as an immediate gene neighbor of the toxin gene, which usually contains toxin domains of the Tox-JAB-2 family  .. \n+PF15656 Toxin with a H, D/N and C signature<br>A predicted alpha/beta fold peptidase domain with a strongly conserved triad of a histidine, aspartate/asparagine and cysteine residues that are predicted to comprise the active site of the predicted peptidase. Proteins bearing this predicted toxin domain are particularly common in both intracellular and extracellular pathogens  .. \n+PF15657 HNH/Endo VII superfamily nuclease toxins<br>A predicted toxin of the HNH/Endonuclease VII fold present in bacterial polymorphic toxin systems with a characteristic conserved [ED]H motif and two histidine residues. In bacterial polymorphic toxin systems, the toxin is exported by the type 2, type 5, type 6, type 7 or Photorhabdus virulence cassette (PVC)-type secretion system  .. 
\n+PF15658 Latrotoxin C-terminal domain<br>A toxin domain present in arthropod alphaproteobacterial, gammaproteobacterial endosymbionts and also at the C-termini of the latrotoxins of the black widow spider. The domain is characterized by a conserved, hydrophobic helix and is predicted to associate with the cell membrane  .. \n+PF15659 JAB-like toxin  1<br>\n+PF15660 Immunity protein 49<br>\n'
b
diff -r 000000000000 -r 68a3648c7d91 pfam_annot/pfamA.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_annot/pfamA.txt Thu Dec 22 04:45:31 2016 -0500
[
b'@@ -0,0 +1,14831 @@\n+1\tPF00389\t2-Hacid_dh\t2-Hacid_DH; \tD-isomer specific 2-hydroxyacid dehydrogenase, catalytic domain\tFinn RD, Griffiths-Jones SR\tanon\tProsite\tDomain\tThis family represents the largest portion of the catalytic domain  of 2-hydroxyacid dehydrogenases as the NAD binding domain is  inserted within the structural domain.\t24.60\t24.60\t24.60\t24.60\t24.50\t24.50\thmmbuild  -o /dev/null --hand HMM SEED\t133\thmmsearch -Z 23193494 -E 1000 --cpu 4 HMM pfamseq\t0.71\t-9.58\t0.71\t-10.50\t0.71\t-4.69\t98\t16860\t2012-10-02 14:31:05\t2003-04-07 12:59:11\t25\t50\t4524\t180\t4691\t12854\t6366\t308.20\t17\t85.12\tCHANGED\tlllhp....sh..pptshphlcc.........plphtp....shsp-..clhcthps..s-ulhstsps.....plspcll.pth..spLKlluptusGhDslDlcsAsc+GIhVsNsPs.ssspulAEhsluhllulsRclspspppl+pGpapppthhshphtspshsllGhsthGttssthtpthththhhhshhhs.pttpttthhhht.thhh...................psspllshps.tshppthhhtpptttthhsshhlsssttsshhststhtshtpptthsssshssppp.sshtp.LhshsNVllTPHluus..TpEAppshutpsspslhphhpG....psstssVs\t.......................................................................................h.t.hpt..............h..pl.phhp.............hspp....ph..h....p...t..l...t.s...........s..-..s.l.h..s.p.st.s...........l..s.p..c..ll...pth............spL+..hl.u....p....h.u...s.G....h....-....s..l....D....l.s.s....A...p.c..........p...........G...........IhV............s........N.s......P.s......s...s..s....p......u.....V....A..EhsluhlLshsRplspsppph+pGpapttshhshthtstphtslshsthsthhtthtpthththhhh.hh.ttttttttthh..t..h..........................................................................................................................................................tst.hh.htshts.ppthhhspphhthhhsshhlhssttsshhppshhtshhpttthsssshss.tptsshtpsLhphsNVllTPHluus..T.tE.up.pp.hs.p...t.s.hpslh.p.hh.ps......t....p.l..............................................................................................................................................................................................
....................................................................................................................................................................................................................................................................................................\t0\t1361\t2755\t3878\n+2\tPF00198\t2-oxoacid_dh\t\t2-oxoacid dehydrogenases acyltransferase (catalytic domain)\tBateman A, Finn RD, Griffiths-Jones SR\tanon\tBateman A\tDomain\tThese proteins contain one to three copies of a lipoyl binding domain followed by the catalytic domain.\t23.00\t23.00\t23.10\t23.50\t22.90\t22.90\thmmbuild  -o /dev/null HMM SEED\t231\thmmsearch -Z 23193494 -E 1000 --cpu 4 HMM pfamseq\t0.70\t-10.60\t0.70\t-11.30\t0.70\t-5.09\t100\t10039\t2012-10-02 12:01:53\t2003-04-07 12:59:11\t18\t49\t3997\t65\t2920\t7770\t5825\t224.80\t35\t47.20\tCHANGED\tssstpplPlsshR+slAcphspStp.shPphshs...s-l-hspLhplRp.p..................lppph..tt........KlohhshllKAsuhALccaPhlNush..s..s-s.....llhpcplslGlAVsos.....pG..LlVPVl+sscptulhplupclpcLsp+ARss+.Lpss-hpG.GTFTlSNlGsh.G.sphhoPIIN..PpsAILulGpl.pcpP.ls.tssp.........lshpph.....hsloLohDHRllDGAsuucFlppl+.chLE..sPttll.l\t.......................................................t....ppl.h.stlR+tlApphhcuhp.ss.s..p...lThh...s-lDhst...lhshRp.p.......................................hpp.t.htcpps..............KLohhsahsKA.lstAL..+..c.a.P.tlNuuh.........s...............scs........llh+ph...hs..lG.......lAV..s...T..s............pG.....L.lV..P...V.l+..ss...-.p.h.ultpls.p.........c.l...pcLup....+.......AR.cG...K..Ls.....s...p-h..pG...GTFTI....oNhG...s........h....G..uh..h..T.PI....l.Ns..Pps.A.I.L.G.luph..tp+......P.ls....hssp................................lshcsh...h.LuLSaDHRllDGtpuu.pFLspl+.phLE..sPtthl.............................\t0\t940\t1801\t2459\n+3\tPF04029\t2-ph_phosp\t\t2-phosphosulpholactate phosphatase\tKerrison ND, Finn RD\tanon\tCOG2045\tFamily\tThought to catalyse 2-phosphosulpholactate = sulpholactate + phosphate. 
Probable magnesium cofactor.  Involved in the second step of coenzyme M biosynthesis.  Inhibited by vanadate in Methanococcu'..b'th intracellular and extracellular pathogens [1].\t25.00\t25.00\t28.30\t27.40\t20.40\t19.30\thmmbuild  -o /dev/null HMM SEED\t119\thmmsearch -Z 23193494 -E 1000 --cpu 4 HMM pfamseq\t0.71\t-9.62\t0.71\t-10.73\t0.71\t-3.90\t9\t20\t2012-10-09 19:11:59\t2012-10-09 20:11:59\t1\t5\t16\t0\t5\t19\t0\t116.40\t28\t18.40\tNEW\ttlaGscsphtshc.lptslpsIs+p.S..spsIhIlSGoHGhssG.........pNWhtps.........lRcPpl.h-hpFhtpDhpshpt....hscplhlhDlssss.tchss..lpss......ssphILuYCaStsDpsht.\t.........................lhstptth.sht.lptshphlt+p.s.sstcIhIlSGoHGtssG.........pNasups...............lRcPsl.hE+tFahEDlpsap.t...........hstpV+lhDlushop.pEhss.plpss......spplIhGaCaSpsDchhh.h....\t0\t1\t2\t4\n+15517\tPF15657\tTox-HNH-EHHH\t\tHNH/Endo VII superfamily nuclease toxins\tZhang D, de Souza RF, Anantharaman V, Iyer LM, Aravind L, Finn RD\trdf\t[1]\tFamily\tA predicted toxin of the HNH/Endonuclease VII fold present in bacterial polymorphic toxin systems with a characteristic conserved [ED]H motif and two histidine residues. 
In bacterial polymorphic toxin systems, the toxin is exported by the type 2, type 5, type 6, type 7 or Photorhabdus virulence cassette (PVC)-type secretion system [1].\t25.00\t25.00\t26.70\t26.60\t23.40\t23.40\thmmbuild  -o /dev/null HMM SEED\t72\thmmsearch -Z 23193494 -E 1000 --cpu 4 HMM pfamseq\t0.72\t-8.75\t0.72\t-9.70\t0.72\t-3.83\t33\t80\t2012-10-09 19:15:32\t2012-10-09 20:15:32\t1\t26\t69\t0\t26\t75\t0\t66.90\t30\t8.65\tNEW\ttlhcpsspshhsRpYcFpsscG.....ppllIp-HotGHths.....tspsPHFNsts.cs...........................hpsGphsspcsHYsa\t.......hhsspspshhsR.acapshcG.........pKhlIp-HotG+th.......spuPHapsts.pp............................h.c.puphstp.psHY.a.......................\t0\t2\t9\t18\n+15518\tPF15658\tLatrotoxin_C\t\tLatrotoxin C-terminal domain\tZhang D, de Souza RF, Anantharaman V, Iyer LM, Aravind L, Finn RD\trdf\t[1]\tFamily\tA toxin domain present in arthropod alphaproteobacterial, gammaproteobacterial endosymbionts and also at the C-termini of the latrotoxins of the black widow spider. 
The domain is characterized by a conserved, hydrophobic helix and is predicted to associate with the cell membrane [1].\t25.00\t25.00\t46.20\t43.40\t24.70\t22.60\thmmbuild  -o /dev/null HMM SEED\t127\thmmsearch -Z 23193494 -E 1000 --cpu 4 HMM pfamseq\t0.71\t-9.89\t0.71\t-10.46\t0.71\t-4.31\t8\t23\t2012-10-09 19:27:40\t2012-10-09 20:27:40\t1\t16\t8\t0\t8\t25\t0\t126.00\t38\t6.65\tNEW\tphDsNuslhLLDlLIRKlTspKYhsT....sc.polSPLEApGYALsIsKcFEcVlc.QAulKSGISh+cLNIDhlElQcpIssK..ItSGKFsEISulLsSYlEcAhPs......ucLS.Kph-KFhspFNscl-.....slLNp\t......hDVNGslhLLDlLIRKhoupKYhss....sc....pSISsLEAQuYALsIsc+FEcVLp.psulKuGlShcpLNlDhstlQpcIhtK..lhuG+FsEIuthLsSasccAhPt......upLp.KphcKFh.pFppthc.hlp.............\t0\t0\t2\t2\n+15519\tPF15659\tToxin-JAB1\t\tJAB-like toxin  1\tZhang D, de Souza RF, Anantharaman V, Iyer LM, Aravind L, Finn RD\trdf\t[1]\tFamily\t\\N\t26.40\t26.40\t26.40\t26.60\t25.60\t25.70\thmmbuild  -o /dev/null HMM SEED\t162\thmmsearch -Z 23193494 -E 1000 --cpu 4 HMM pfamseq\t0.71\t-10.07\t0.71\t-11.36\t0.71\t-4.36\t13\t24\t2012-10-10 14:49:21\t2012-10-09 20:38:55\t1\t2\t16\t0\t6\t24\t1\t153.20\t26\t34.55\tNEW\tp+ss+las....phtsusps-phhssp.ts..psIpl.spsll...uphsp..tpsppG....thpoas....oTssspsAtslFcFsA-NTo..VEWpLsshp.-sGs.pshsltTspcptuspss..hsph.pc.htssuphlhc..IHSH...........Ptss.....tsS.....hsDhphup.tps..........huhYhpc.......tthhphYs\t......................ppssclhp.....hhpssps-phh.hp.ts..psIpl.sppll...sthhp..tpsppG....hhpoas....sTss.csAtslFcFsA-NTo..VEWpLssht.-p....Gs.pshhlsTsppppuVt....hs.htpc.h.stsphlIc..hHSH...........Phss....tsS.....spDhpshp.hps...........shahcc........t.h..Y................\t0\t6\t6\t6\n+15520\tPF15660\tImm49\t\tImmunity protein 49\tZhang D, de Souza RF, Anantharaman V, Iyer LM, Aravind L, Finn RD\trdf\t[1]\tFamily\t\\N\t25.00\t25.00\t168.90\t168.80\t24.80\t19.60\thmmbuild  -o /dev/null HMM SEED\t84\thmmsearch -Z 23193494 -E 1000 --cpu 4 HMM pfamseq\t0.72\t-9.17\t0.72\t-9.79\t0.72\t-3.94\t4\t10\t2012-10-09 20:02:34\t2012-10-09 
21:02:34\t1\t1\t10\t0\t0\t10\t0\t82.90\t99\t96.96\tNEW\thRALVALKRELLPGVTTFIDSVRLEAIDDKADRLMVTTSVGEEARLVYFNPDFAGTPTFGRRLYRLRDWTDDLADWVDRLRRER\tVRALVALKRELLPGVTTFIDSVRLEAIDDKADRLMVTTSVGEEARLVYFNPDFAGTPTFGRRLYRLRDWTDDLADWVDRLRRER\t0\t0\t0\t0\n'
b
diff -r 000000000000 -r 68a3648c7d91 pfam_annot/pfamA.txt.gz
b
Binary file pfam_annot/pfamA.txt.gz has changed
b
diff -r 000000000000 -r 68a3648c7d91 pfam_annot/pfam_annot.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_annot/pfam_annot.xml Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,13 @@
+<tool id="pfam_Annot" name="PFAM output annotator" version="0.">
+ <description>Generate synthetic reports</description>
+ <command> /home/inmare/galaxy/tools/pfam_annot/annota.pl $prot $pfam $out</command>
+ <inputs>
+  <param name="prot" type="data" format="fasta" label="protein file used as input with pfamScan" help="fasta only"/>
+  <param name="pfam" type="data" format="tabular" label="pfamScan output" help="this file should match the protein file"/>
+ </inputs>
+ <outputs>
+  <data name="out" ftype="tabular" format="html" label="annotated pfam file"/>
+ </outputs>
+ <test/>
+ <help> This tool produces a synthetic report, in fasta format, containing a description of the PFAM domains annotated to each protein. A separate report is generated for each fosmid. Fosmid names are inferred from the headers of the input fasta file, using the naming convention described in the "Sanger Fosmid End" annotator tool.  Therefore contigs should be assigned to fosmids prior to performing this step, and ideally prior to the annotation of functional domains. The output file contains links to the PFAM wiki, where a more comprehensive description of the domains can be found. A pfamScan output table is required as input.  The output of this tool is compatible with the PFAM search utility, which can be used to perform keyword searches on the annotation.  </help>
+</tool>
b
diff -r 000000000000 -r 68a3648c7d91 pfam_annot/pro
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_annot/pro Thu Dec 22 04:45:31 2016 -0500
[
b'@@ -0,0 +1,646 @@\n+<html>\n+<head></head>\n+<body>\n+Proteins with PFAM domains:\n+<br><br>\n+<div>\n+<table cellpadding="0" width=650>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00001</b><br>\n+</center>\n+</td\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MENNLENLTIGVFAKAAGVNVETIRFYQRKGLLREPDKPYGSIRRYGEADVVRVKFVKSAQRLGFSLDEIAELLRLDDGTHCEEASSLAE\n+HKLKDVREKMADLARMETVLSELVCACHARKGNVSCPLIASLQGEAGLARSAMP\n+\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF00376> PF00376</a>\n+<p align="justify">merR; <br>MerR family regulatory protein. Prosite & Pfam-B_3021 (Release 7.5). </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0123> CL0123</a>\n+ <p align="justify">This family contains a diverse range of mostly DNA-binding domains that contain a helix-turn-helix motif. </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF09278> PF09278</a>\n+<p align="justify">MerR, DNA binding<br>Members of this family of DNA-binding domains are predominantly found in the prokaryotic transcriptional regulator MerR. They adopt a structure consisting of a core of three alpha helices, with an architecture that is similar to that of the \'winged helix\' fold  .. </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0123> CL0123</a>\n+ <p align="justify">This family contains a diverse range of mostly DNA-binding domains that contain a helix-turn-helix motif. 
</p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00002</b><br>\n+</center>\n+</td\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MSEPQNGRGALFAGGLAAILASTCCLGPLVLVALGFSGAWIGNLTVLEPYRPLFIGAALVALFFAWKRIYRPVQACKPGEVCAIPQVRAT\n+YKLIFWIVAVLVLVALGFPYVVPFFY\n+\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF02411> PF02411</a>\n+<p align="justify">MerT mercuric transport protein<br>Pfam-B_1796 (release 5.4). MerT is an mercuric transport integral membrane protein and  is responsible for transport of the Hg2+ iron from periplasmic  MerP (also part of the transport system) to mercuric reductase (MerE).. </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00003</b><br>\n+</center>\n+</td\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MKKLFASLALAAAVAPVWAATQTVTLAVPGMTCAACPITVKKALSKVEGVSKVDVGFEKREAVVTFDDTKASVQKLTKATADAGYPSSVK\n+Q\n+\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF00403> PF00403</a>\n+<p align="justify">Heavy-metal-associated domain<br></p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00004</b><br>\n+</center>\n+</td\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MGLMTRIADKTGALGSVVSAMGCAACFPALASFGAAIGLGFLSQYEGLFISRLLPLFAALAFLANALGWFSHRQWLRSLLGMIGPAIVFA\n+ATVWLLGNWWTANLMYVGLALMIGVSIWDFVSPAHRRCGPDGCELPAKRL\n+\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF03203> PF03203</a>\n+<p align="justify">MerC mercury resistance protein<br>Pfam-B_2720 (release 6.5). 
</p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00005</b><br>\n+</center>\n+</td\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MSTLKITGMTCDSCAVHVKDALEKVPGVQSADVSYAKGSAKLAIEVGTSPDALTAAVAGLGYRATLADAPSVSTPGGLLDKMRDLLGRND\n+KTGSSGALHIAVIGSGGAAMAAALKAVEQGARVTLIERGTIGGTCVNVGCVPSKIMIRAAHIAHLRRESPFDGGIAATTPTIQRTALLAQ\n+QQARVDELRHAKYEGILEGNPAITVLHGSARFKDNRNLIVQLNDGGERVVAFDRCLIATGASPAVPPIPGLKDTPYWTSTEALVSETIPK\n+RLAVIGSSVVALELAQAFARLGAKVTILARSTLFFREDPAIGEAVTAAFRMEGIEVREHTQASQVAYINGEGDGEFVLTTAHGELRADKL\n+LVATGRAPNTRKLALDATGVTLTPQGAIVIDPGMRTSVEHIYAAGDCTDQPQFVYVAAAAGTRAAINMTGGDAALNLTAMPAVVFTDPQV\n+ATVGYSEAEAHHDGIKTDSRTLTLDNVPRALANFDTRGFIK'..b'01842> PF01842</a>\n+<p align="justify">ACT domain<br>This family of domains generally have a regulatory role. ACT domains are linked to a wide range of metabolic enzymes that are regulated by amino acid concentration. Pairs of ACT domains bind specifically to a particular amino acid leading to regulation of the linked enzyme.  The ACT domain is found in: D-3-phosphoglycerate dehydrogenase EC:1.1.1.95 Swiss:P08328, which is inhibited by serine  . Aspartokinase EC:2.7.2.4 Swiss:P53553, which is regulated by lysine. Acetolactate synthase small regulatory subunit Swiss:P00894, which is inhibited by valine. Phenylalanine-4-hydroxylase EC:1.14.16.1 Swiss:P00439, which is regulated by phenylalanine. Prephenate dehydrogenase EC:4.2.1.51 Swiss:P21203. formyltetrahydrofolate deformylase EC:3.5.1.10, Swiss:P37051, which is activated by methionine and inhibited by glycine. GTP pyrophosphokinase EC:2.7.6.5 Swiss:P11585. </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0070> CL0070</a>\n+ <p align="justify">These domains are involved in binding to amino-acids and causing allosteric regulation of linked enzyme domains  . The relationship between these two families was first noticed in  . 
</p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00016</b><br>\n+</center>\n+</td\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MSDISRVKILSALMDGRAWTATELSSVANISASTASSHLSKLLDCQLITVVAQGKHRYFRLAGKDIAELMESMMGISLNHGVHARVSTPV\n+HLRKARTCYDHLAGEVAVKIYDSLCQQQWITENGSMITLSGIQYFHEMGIDVPSKHSRKICCACLDWSERRFHLGGYVGAALFSLYESKG\n+WLTRHLGYREVTITEKGYAAFKTHFHI\n+\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF12840> PF12840</a>\n+<p align="justify">Helix-turn-helix domain<br>This domain represents a DNA-binding Helix-turn-helix domain found in transcriptional regulatory proteins.. </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0123> CL0123</a>\n+ <p align="justify">This family contains a diverse range of mostly DNA-binding domains that contain a helix-turn-helix motif. </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00017</b><br>\n+</center>\n+</td\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MSRLDKSKVINSALELLNEVGIEGLTTRKLAQKLGVEQPTLYWHVKNKRALLDALAIEMLDRHHTHFCPLEGESWQDFLRNNAKSFRCAL\n+LSHRDGAKVHLGTRPTEKQYETLENQLAFLCQQGFSLENALYALSAVGHFTLGCVLEDQEHQVAKEERETPTTDSMPPLLRQAIELFDHQ\n+GAEPAFLFGLELIICGLEKQLKCESGS\n+\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF00440> PF00440</a>\n+<p align="justify">tetR; <br>Bacterial regulatory proteins, tetR family. </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0123> CL0123</a>\n+ <p align="justify">This family contains a diverse range of mostly DNA-binding domains that contain a helix-turn-helix motif. 
</p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF02909> PF02909</a>\n+<p align="justify">tetR_C; <br>Tetracyclin repressor, C-terminal all-alpha domain. </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0174> CL0174</a>\n+ <p align="justify">TetR protein, C-terminal domain-like This clan features families of transcriptional regulators for multidrug efflux pumps, which belong to the TetR superfamily. They are induced by the presence of a variety of factors, such as antibiotics or organic solvents. The C-terminal region featured in these families is thought to contain the inducer-binding site; the divergent sequences in this region allow for the binding of a variety of different inducers [1-4]. </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n'
b
diff -r 000000000000 -r 68a3648c7d91 pfam_annot/prots
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_annot/prots Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,80 @@
+>PROKKA_00001 Mercuric resistance operon regulatory protein
+MENNLENLTIGVFAKAAGVNVETIRFYQRKGLLREPDKPYGSIRRYGEADVVRVKFVKSA
+QRLGFSLDEIAELLRLDDGTHCEEASSLAEHKLKDVREKMADLARMETVLSELVCACHAR
+KGNVSCPLIASLQGEAGLARSAMP*
+>PROKKA_00002 MerT mercuric transport protein
+MSEPQNGRGALFAGGLAAILASTCCLGPLVLVALGFSGAWIGNLTVLEPYRPLFIGAALV
+ALFFAWKRIYRPVQACKPGEVCAIPQVRATYKLIFWIVAVLVLVALGFPYVVPFFY*
+>PROKKA_00003 Mercuric transport protein periplasmic component precursor
+MKKLFASLALAAAVAPVWAATQTVTLAVPGMTCAACPITVKKALSKVEGVSKVDVGFEKR
+EAVVTFDDTKASVQKLTKATADAGYPSSVKQ*
+>PROKKA_00004 Mercuric resistance protein MerC
+MGLMTRIADKTGALGSVVSAMGCAACFPALASFGAAIGLGFLSQYEGLFISRLLPLFAAL
+AFLANALGWFSHRQWLRSLLGMIGPAIVFAATVWLLGNWWTANLMYVGLALMIGVSIWDF
+VSPAHRRCGPDGCELPAKRL*
+>PROKKA_00005 Mercuric reductase
+MSTLKITGMTCDSCAVHVKDALEKVPGVQSADVSYAKGSAKLAIEVGTSPDALTAAVAGL
+GYRATLADAPSVSTPGGLLDKMRDLLGRNDKTGSSGALHIAVIGSGGAAMAAALKAVEQG
+ARVTLIERGTIGGTCVNVGCVPSKIMIRAAHIAHLRRESPFDGGIAATTPTIQRTALLAQ
+QQARVDELRHAKYEGILEGNPAITVLHGSARFKDNRNLIVQLNDGGERVVAFDRCLIATG
+ASPAVPPIPGLKDTPYWTSTEALVSETIPKRLAVIGSSVVALELAQAFARLGAKVTILAR
+STLFFREDPAIGEAVTAAFRMEGIEVREHTQASQVAYINGEGDGEFVLTTAHGELRADKL
+LVATGRAPNTRKLALDATGVTLTPQGAIVIDPGMRTSVEHIYAAGDCTDQPQFVYVAAAA
+GTRAAINMTGGDAALNLTAMPAVVFTDPQVATVGYSEAEAHHDGIKTDSRTLTLDNVPRA
+LANFDTRGFIKLVVEEGSGRLIGVQAVAPEAGELIQTAALAIRNRMTVQELADQLFPYLT
+MVEGLKLAAQTFNKDVKQLSCCAG*
+>PROKKA_00006 zinc-responsive transcriptional regulator
+MSAYTVSQLAHNAGVSVHIVRDYLVRGLLRPVACTTGGYGVFDDAALQRLCFVRAAFEAG
+IGLDALARLCRALDAADGAQAAAQLAVLRQLVERRRAALAHLDAQLASMPAERAHEEALP
+*
+>PROKKA_00007 MerE protein
+VNAPDKLPPETRQPVSGYLWGALAVLTCPCHLPILAAVLAGTTAGAFLGEHWGVAALALT
+GLFVLAVTRLLRAFRGGS*
+>PROKKA_00008 Phytochrome-like protein cph2
+MTSSQPAGWTAAELAQAAARGQLDLHYQPLVDLRDHRIAGAEALMRWRHPRLGLLPPGQF
+LPLAESFGLMPEIGAWVLGEACRQMHKWQGPAWQPFRLAINVSASQVGPTFDDEVKRVLA
+DMALPAELLEIELTESVAFGNPALFASFDALRAIGVRFAADDFGTGYSCLQHLKCCPITT
+LKIDQSFVARLPDDARDQTIVRAVIQLAHGLGMDVIFRRRLHQLIGRNGCCAASS*
+>PROKKA_00009 Transposon Tn7 transposition protein TnsB
+MATDTPRIPEQGVATLPDEAWERARRRAEIISPLAQSETVGHEAADMAAQALGLSRRQVY
+VLIRRARQGSGLVTDLVPGQSGGGKGKGRLPEPVERVIHELLQKRFLTKQKRSLAAFHRE
+VTQVCKAQKLRVPARNTVALRIASLDPRKVIRRREGQDAARDLQGVGGEPPAVTAPLEQV
+QIDHTVIDLIVVDDRDRQPIGRPYLTLAIDVFTRCVLGMVVTLEAPSAVSVGLCLVHVAC
+DKRPWLEGLNVEMDWQMSGKPLLLYLDNAAEFKSEALRRGCEQHGIRLDYRPLGQPHYGG
+IVERIIGTAMQMIHDELPGTTFSNPDQRGDYDSENKAALTLRELERWLTLAVGTYHGSVH
+NGLLQPPAARWAEAVARVGVPAVVTRATSFLVDFLPILRRTLTRTGFVIDHIHYYADGHC
+CK*
+>PROKKA_00010 Integrase core domain protein
+MNPFKGRHFQRDIILWAVRWYCKYGISYRELQEMLAERGVNVDHSTIYRWVQRYAPEMEK
+RLRWYWRNPSDLCPWHMDETYVKVNGRWAYLYRAVDSRGRTVDFYLSSRRNSKAAYRFLG
+KILNNVKKWQIPRFINTDKAPAYGRALALLKREGRCPSDVEHRQIKYRNNVIECDHGKLK
+RIIGATLGFKSMKTAYATIKGIEVMRALRKGQASAFYYGDPLGEMRLVSRVFEM*
+>PROKKA_00011 DNA-binding transcriptional regulator LysR
+MKLRHLDIFYAVMTCGSLTRAAEVLHISQPAASKALKHAEH*
+>PROKKA_00012 hypothetical protein
+MPSRFLTPYIPLVNLFSLHVYELILVTTKPKFEL*
+>PROKKA_00013 Sodium/glutamate symport carrier protein
+MILDASYTLLVACIALLIGMFVVKFTPFLQKNHIPEAVVGGFIVAIVLLIIDKTSGYSFT
+FDASLQSLLMLTFFSSIGLSSDFSRLIKGGKPLVLLTIAVTILIAIQNTVGMSMAVMMNE
+SPFIGLIAGSITLTGGHGNAGAWGPILADKYGVTGAVELAMACATLGLVLGGLVGGPVAR
+HLLKKVSIPKTTEQERDTIVEAFEQPSVKRKINANNVIETISMLIICIVVGGYISALFKD
+TFLQLPTFVWCLFVGIIIRNTLTHVFKHEVFEPTVDVLGSVALSLFLAMALMSLKFGQLA
+SMAGPVLIIIAVQTVVMVLFACFVTFKMMGKDYDAVVISAGHCGFGMGATPTAIANMQTV
+TKAFGPSHKAFLVVPMVGAFIVDISNSILIKIFIEIGTYFT*
+>PROKKA_00014 Antibiotic biosynthesis monooxygenase
+MIAVIFEVQIQPDQQTRYLTLAEELRPLLSHVAGFISIERFQSLATEGKMLSLSWWENEY
+AVLQWKNHVLHAKAQQEGRESIFDFYKISIAHITREYSFKKDKDNV*
+>PROKKA_00015 hypothetical protein
+MFDVHVVLDNQIGQLALLGKTLGNKGIGLEGGGIFTVGDECHAHFLVEQGKEAKIALEQA
+GLLVLAIRTPLIRKLKQEKPGELGEIARVLAENNINILVQYSDHANQLILITDNDSMAAS
+VTLPWAIK*
+>PROKKA_00016 Helix-turn-helix domain protein
+MSDISRVKILSALMDGRAWTATELSSVANISASTASSHLSKLLDCQLITVVAQGKHRYFR
+LAGKDIAELMESMMGISLNHGVHARVSTPVHLRKARTCYDHLAGEVAVKIYDSLCQQQWI
+TENGSMITLSGIQYFHEMGIDVPSKHSRKICCACLDWSERRFHLGGYVGAALFSLYESKG
+WLTRHLGYREVTITEKGYAAFKTHFHI*
+>PROKKA_00017 Tetracycline repressor protein class B from transposon Tn10
+MSRLDKSKVINSALELLNEVGIEGLTTRKLAQKLGVEQPTLYWHVKNKRALLDALAIEML
+DRHHTHFCPLEGESWQDFLRNNAKSFRCALLSHRDGAKVHLGTRPTEKQYETLENQLAFL
+CQQGFSLENALYALSAVGHFTLGCVLEDQEHQVAKEERETPTTDSMPPLLRQAIELFDHQ
+GAEPAFLFGLELIICGLEKQLKCESGS*
b
diff -r 000000000000 -r 68a3648c7d91 pfam_annot/prova
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_annot/prova Thu Dec 22 04:45:31 2016 -0500
[
b'@@ -0,0 +1,536 @@\n+<html>\n+<head></head>\n+<body>\n+Proteins with PFAM domains:\n+<br><br>\n+<div>\n+<table cellpadding="0" width=950>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00001</b><br>\n+</center>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MENNLENLTIGVFAKAAGVNVETIRFYQRKGLLREPDKPYGSIRRYGEADVVRVKFVKSAQ<br>RLGFSLDEIAELLRLDDGTHCEEASSLAEHKLKDVREKMADLARMETVLSELVCACHARK<br>GNVSCPLIASLQGEAGLARSAMP*\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF00376> PF00376</a>merR; <br>MerR family regulatory protein Prosite & Pfam-B_3021 (Release 7.5) \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0123> CL0123</a> This family contains a diverse range of mostly DNA-binding domains that contain a helix-turn-helix motif. \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF09278> PF09278</a>MerR, DNA binding<br>Members of this family of DNA-binding domains are predominantly found in the prokaryotic transcriptional regulator MerR. They adopt a structure consisting of a core of three alpha helices, with an architecture that is similar to that of the \'winged helix\' fold  . \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0123> CL0123</a> This family contains a diverse range of mostly DNA-binding domains that contain a helix-turn-helix motif. 
\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00002</b><br>\n+</center>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MSEPQNGRGALFAGGLAAILASTCCLGPLVLVALGFSGAWIGNLTVLEPYRPLFIGAALVA<br>LFFAWKRIYRPVQACKPGEVCAIPQVRATYKLIFWIVAVLVLVALGFPYVVPFFY*\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF02411> PF02411</a>MerT mercuric transport protein<br>MerT mercuric transport protein MerT is an mercuric transport integral membrane protein and  is responsible for transport of the Hg2+ iron from periplasmic  MerP (also part of the transport system) to mercuric reductase (MerE). \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00003</b><br>\n+</center>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MKKLFASLALAAAVAPVWAATQTVTLAVPGMTCAACPITVKKALSKVEGVSKVDVGFEKRE<br>AVVTFDDTKASVQKLTKATADAGYPSSVKQ*\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF00403> PF00403</a>Heavy-metal-associated domain<br>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00004</b><br>\n+</center>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MGLMTRIADKTGALGSVVSAMGCAACFPALASFGAAIGLGFLSQYEGLFISRLLPLFAALA<br>FLANALGWFSHRQWLRSLLGMIGPAIVFAATVWLLGNWWTANLMYVGLALMIGVSIWDFV<br>SPAHRRCGPDGCELPAKRL*\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF03203> PF03203</a>MerC mercury resistance protein<br>MerC mercury resistance protein \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00005</b><br>\n+</center>\n+<td bgcolor="#czb9dz">\n+<pre> 
\n+MSTLKITGMTCDSCAVHVKDALEKVPGVQSADVSYAKGSAKLAIEVGTSPDALTAAVAGLG<br>YRATLADAPSVSTPGGLLDKMRDLLGRNDKTGSSGALHIAVIGSGGAAMAAALKAVEQGA<br>RVTLIERGTIGGTCVNVGCVPSKIMIRAAHIAHLRRESPFDGGIAATTPTIQRTALLAQQ<br>QARVDELRHAKYEGILEGNPAITVLHGSARFKDNRNLIVQLNDGGERVVAFDRCLIATGA<br>SPAVPPIPGLKDTPYWTSTEALVSETIPKRLAVIGSSVVALELAQAFARLGAKVTILARS<br>TLFFREDPAIGEAVTAAFRMEGIEVREHTQASQVAYINGEGDGEFVLTTAHGELRADKLL<br>VATGRAPNTRKLALDATGVTLTPQGAIVIDPGMRTSVEHIYAAGDCTDQPQFVYVAAAAG<br>TRAAINMTGGDAALNLTAMPAVVFTDPQVATVGYSEAEAHHDGIKTDSRTLTLDNVPRAL<br>ANFDTRGFIKLVVEEGSGRLIGVQAVAPEAGELIQTAALAIRNRMTVQELADQLFPYLTM<br>VEGLKLAAQTFNKDVKQLSCCAG*\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF00403> PF00403</a>Heavy-metal'..b'br>LLVLAIRTPLIRKLKQEKPGELGEIARVLAENNINILVQYSDHANQLILITDNDSMAASV<br>TLPWAIK*\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF01842> PF01842</a>ACT domain<br>This family of domains generally have a regulatory role. ACT domains are linked to a wide range of metabolic enzymes that are regulated by amino acid concentration. Pairs of ACT domains bind specifically to a particular amino acid leading to regulation of the linked enzyme.  The ACT domain is found in: D-3-phosphoglycerate dehydrogenase EC:1.1.1.95 Swiss:P08328, which is inhibited by serine  . Aspartokinase EC:2.7.2.4 Swiss:P53553, which is regulated by lysine. Acetolactate synthase small regulatory subunit Swiss:P00894, which is inhibited by valine. Phenylalanine-4-hydroxylase EC:1.14.16.1 Swiss:P00439, which is regulated by phenylalanine. Prephenate dehydrogenase EC:4.2.1.51 Swiss:P21203. formyltetrahydrofolate deformylase EC:3.5.1.10, Swiss:P37051, which is activated by methionine and inhibited by glycine. 
GTP pyrophosphokinase EC:2.7.6.5 Swiss:P11585 \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0070> CL0070</a> These domains are involved in binding to amino-acids and causing allosteric regulation of linked enzyme domains  . The relationship between these two families was first noticed in  . \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00016</b><br>\n+</center>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MSDISRVKILSALMDGRAWTATELSSVANISASTASSHLSKLLDCQLITVVAQGKHRYFRL<br>AGKDIAELMESMMGISLNHGVHARVSTPVHLRKARTCYDHLAGEVAVKIYDSLCQQQWIT<br>ENGSMITLSGIQYFHEMGIDVPSKHSRKICCACLDWSERRFHLGGYVGAALFSLYESKGW<br>LTRHLGYREVTITEKGYAAFKTHFHI*\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF12840> PF12840</a>Helix-turn-helix domain<br>This domain represents a DNA-binding Helix-turn-helix domain found in transcriptional regulatory proteins. \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0123> CL0123</a> This family contains a diverse range of mostly DNA-binding domains that contain a helix-turn-helix motif. 
\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00017</b><br>\n+</center>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MSRLDKSKVINSALELLNEVGIEGLTTRKLAQKLGVEQPTLYWHVKNKRALLDALAIEMLD<br>RHHTHFCPLEGESWQDFLRNNAKSFRCALLSHRDGAKVHLGTRPTEKQYETLENQLAFLC<br>QQGFSLENALYALSAVGHFTLGCVLEDQEHQVAKEERETPTTDSMPPLLRQAIELFDHQG<br>AEPAFLFGLELIICGLEKQLKCESGS*\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF00440> PF00440</a>tetR; <br>Bacterial regulatory proteins, tetR family \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0123> CL0123</a> This family contains a diverse range of mostly DNA-binding domains that contain a helix-turn-helix motif. \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF02909> PF02909</a>tetR_C; <br>Tetracyclin repressor, C-terminal all-alpha domain \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0174> CL0174</a> TetR protein, C-terminal domain-like This clan features families of transcriptional regulators for multidrug efflux pumps, which belong to the TetR superfamily. They are induced by the presence of a variety of factors, such as antibiotics or organic solvents. The C-terminal region featured in these families is thought to contain the inducer-binding site; the divergent sequences in this region allow for the binding of a variety of different inducers [1-4]. \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+</table>\n+</div>\n+</body>\n+</html>\n'
b
diff -r 000000000000 -r 68a3648c7d91 pfam_annot/prova2
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_annot/prova2 Thu Dec 22 04:45:31 2016 -0500
[
b'@@ -0,0 +1,660 @@\n+<html>\n+<head>\n+<style type="text/css">\n+span {\n+\ttext-decoration:underline;\n+\tcolor:blue;\n+\tcursor:pointer;\n+}\n+</style>\n+<script src="script.js"></script>\n+<script>\n+\tshow(elementID)\n+</script>\n+</head>\n+Proteins with PFAM domains:\n+<br><br>\n+<div>\n+<table cellpadding="0" width=650>\n+<td>\n+<HR SIZE=3 WIDTH=80%></HR>\n+<center><b>PROKKA_00001</b><br>\n+</center>\n+</td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MENNLENLTIGVFAKAAGVNVETIRFYQRKGLLREPDKPYGSIRRYGEADVVRVKFVKSAQRLGFSLDEIAELLRLDDGTHCEEASSLAE\n+HKLKDVREKMADLARMETVLSELVCACHARKGNVSCPLIASLQGEAGLARSAMP\n+\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF00376> PF00376</a>\n+<p align="justify">merR; <br>MerR family regulatory protein Prosite & Pfam-B_3021 (Release 7.5) </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0123> CL0123</a>\n+ <p align="justify">This family contains a diverse range of mostly DNA-binding domains that contain a helix-turn-helix motif. </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF09278> PF09278</a>\n+<p align="justify">MerR, DNA binding<br>Members of this family of DNA-binding domains are predominantly found in the prokaryotic transcriptional regulator MerR. They adopt a structure consisting of a core of three alpha helices, with an architecture that is similar to that of the \'winged helix\' fold . </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0123> CL0123</a>\n+ <p align="justify">This family contains a diverse range of mostly DNA-binding domains that contain a helix-turn-helix motif. 
</p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%></HR>\n+<center><b>PROKKA_00002</b><br>\n+</center>\n+</td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MSEPQNGRGALFAGGLAAILASTCCLGPLVLVALGFSGAWIGNLTVLEPYRPLFIGAALVALFFAWKRIYRPVQACKPGEVCAIPQVRAT\n+YKLIFWIVAVLVLVALGFPYVVPFFY\n+\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF02411> PF02411</a>\n+<p align="justify">MerT mercuric transport protein<br>Pfam-B_1796 (release 5.4) MerT is an mercuric transport integral membrane protein and is responsible for transport of the Hg2+ iron from periplasmic MerP (also part of the transport system) to mercuric reductase (MerE). </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%></HR>\n+<center><b>PROKKA_00003</b><br>\n+</center>\n+</td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MKKLFASLALAAAVAPVWAATQTVTLAVPGMTCAACPITVKKALSKVEGVSKVDVGFEKREAVVTFDDTKASVQKLTKATADAGYPSSVK\n+Q\n+\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF00403> PF00403</a>\n+<p align="justify">Heavy-metal-associated domain<br></p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%></HR>\n+<center><b>PROKKA_00004</b><br>\n+</center>\n+</td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MGLMTRIADKTGALGSVVSAMGCAACFPALASFGAAIGLGFLSQYEGLFISRLLPLFAALAFLANALGWFSHRQWLRSLLGMIGPAIVFA\n+ATVWLLGNWWTANLMYVGLALMIGVSIWDFVSPAHRRCGPDGCELPAKRL\n+\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF03203> PF03203</a>\n+<p align="justify">MerC mercury resistance protein<br>Pfam-B_2720 (release 6.5) </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%></HR>\n+<center><b>PROKKA_00005</b><br>\n+</center>\n+</td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<pre> 
\n+MSTLKITGMTCDSCAVHVKDALEKVPGVQSADVSYAKGSAKLAIEVGTSPDALTAAVAGLGYRATLADAPSVSTPGGLLDKMRDLLGRND\n+KTGSSGALHIAVIGSGGAAMAAALKAVEQGARVTLIERGTIGGTCVNVGCVPSKIMIRAAHIAHLRRESPFDGGIAATTPTIQRTALLAQ\n+QQARVDELRHAKYEGILEGNPAITVLHGSARFKDNRNLIVQLNDGGERVVAFDRCLIATGASPAVPPIPGLKDTPYWTSTEALVSETIPK\n+RLAVIGSSVVALELAQAFARLGAKVTILA'..b'ustify">ACT domain<br>This family of domains generally have a regulatory role. ACT domains are linked to a wide range of metabolic enzymes that are regulated by amino acid concentration. Pairs of ACT domains bind specifically to a particular amino acid leading to regulation of the linked enzyme. The ACT domain is found in: D-3-phosphoglycerate dehydrogenase EC:1.1.1.95 Swiss:P08328, which is inhibited by serine . Aspartokinase EC:2.7.2.4 Swiss:P53553, which is regulated by lysine. Acetolactate synthase small regulatory subunit Swiss:P00894, which is inhibited by valine. Phenylalanine-4-hydroxylase EC:1.14.16.1 Swiss:P00439, which is regulated by phenylalanine. Prephenate dehydrogenase EC:4.2.1.51 Swiss:P21203. formyltetrahydrofolate deformylase EC:3.5.1.10, Swiss:P37051, which is activated by methionine and inhibited by glycine. GTP pyrophosphokinase EC:2.7.6.5 Swiss:P11585 </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0070> CL0070</a>\n+ <p align="justify">These domains are involved in binding to amino-acids and causing allosteric regulation of linked enzyme domains . The relationship between these two families was first noticed in . 
</p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%></HR>\n+<center><b>PROKKA_00016</b><br>\n+</center>\n+</td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MSDISRVKILSALMDGRAWTATELSSVANISASTASSHLSKLLDCQLITVVAQGKHRYFRLAGKDIAELMESMMGISLNHGVHARVSTPV\n+HLRKARTCYDHLAGEVAVKIYDSLCQQQWITENGSMITLSGIQYFHEMGIDVPSKHSRKICCACLDWSERRFHLGGYVGAALFSLYESKG\n+WLTRHLGYREVTITEKGYAAFKTHFHI\n+\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF12840> PF12840</a>\n+<p align="justify">Helix-turn-helix domain<br>This domain represents a DNA-binding Helix-turn-helix domain found in transcriptional regulatory proteins. </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0123> CL0123</a>\n+ <p align="justify">This family contains a diverse range of mostly DNA-binding domains that contain a helix-turn-helix motif. </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%></HR>\n+<center><b>PROKKA_00017</b><br>\n+</center>\n+</td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MSRLDKSKVINSALELLNEVGIEGLTTRKLAQKLGVEQPTLYWHVKNKRALLDALAIEMLDRHHTHFCPLEGESWQDFLRNNAKSFRCAL\n+LSHRDGAKVHLGTRPTEKQYETLENQLAFLCQQGFSLENALYALSAVGHFTLGCVLEDQEHQVAKEERETPTTDSMPPLLRQAIELFDHQ\n+GAEPAFLFGLELIICGLEKQLKCESGS\n+\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF00440> PF00440</a>\n+<p align="justify">tetR; <br>Bacterial regulatory proteins, tetR family </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0123> CL0123</a>\n+ <p align="justify">This family contains a diverse range of mostly DNA-binding domains that contain a helix-turn-helix motif. 
</p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF02909> PF02909</a>\n+<p align="justify">tetR_C; <br>Tetracyclin repressor, C-terminal all-alpha domain </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0174> CL0174</a>\n+ <p align="justify">TetR protein, C-terminal domain-like This clan features families of transcriptional regulators for multidrug efflux pumps, which belong to the TetR superfamily. They are induced by the presence of a variety of factors, such as antibiotics or organic solvents. The C-terminal region featured in these families is thought to contain the inducer-binding site; the divergent sequences in this region allow for the binding of a variety of different inducers [1-4]. </p>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+</table>\n+</div>\n+</body>\n'
b
diff -r 000000000000 -r 68a3648c7d91 pfam_annot/script.js
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_annot/script.js Thu Dec 22 04:45:31 2016 -0500
[
@@ -0,0 +1,11 @@
+function show(elementID) {
+    var ele = document.getElementById(elementID);
+    if (!ele) {
+        alert("no such element");
+        return;
+    }
+    var pages = document.getElementsByClassName('page');
+    for(var i = 0; i < pages.length; i++) {
+        pages[i].style.display = 'none';
+    }
+    ele.style.display = 'block';
b
diff -r 000000000000 -r 68a3648c7d91 pfam_annot/table
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_annot/table Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,51 @@
+# pfam_scan.pl,  run at Fri Jun 19 13:56:11 2015
+#
+# Copyright (c) 2009 Genome Research Ltd
+# Freely distributed under the GNU 
+# General Public License
+#
+# Authors: Jaina Mistry (jaina@ebi.ac.uk), 
+#          Rob Finn (rdf@ebi.ac.uk)
+#
+# This is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later version.
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program. If not, see <http://www.gnu.org/licenses/>. 
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+#      query sequence file: /home/inmare/galaxy/database/files/000/dataset_62.dat
+#     cpu number specified: 2
+#        searching against: /home/inmare/galaxy/tools/pfamScan/hmm/Pfam-A.hmm, with cut off --cut_ga
+#    resolve clan overlaps: on
+#     predict active sites: off
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+#
+# <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan>
+
+PROKKA_00001      9     46      9     46 PF00376.18  MerR              Family     1    38    38     43.1   2.2e-11   1 CL0123   
+PROKKA_00001     51    113     51    113 PF09278.6   MerR-DNA-bind     Domain     1    65    65     67.9   7.2e-19   1 CL0123   
+PROKKA_00002      1    116      1    116 PF02411.10  MerT              Family     1   116   116    214.7   1.7e-64   1 No_clan  
+PROKKA_00003     25     85     25     86 PF00403.21  HMA               Domain     1    61    62     60.3   1.4e-16   1 No_clan  
+PROKKA_00004      8    121      8    123 PF03203.9   MerC              Family     1   114   116     81.1   7.4e-23   1 No_clan  
+PROKKA_00005      3     62      3     63 PF00403.21  HMA               Domain     1    61    62     37.8   1.5e-09   1 No_clan  
+PROKKA_00005    100    410     99    412 PF07992.9   Pyr_redox_2       Domain     2   199   201    140.8   4.8e-41   1 CL0063   
+PROKKA_00005    271    345    271    354 PF00070.22  Pyr_redox         Domain     1    74    80     51.6   9.3e-14   1 CL0063   
+PROKKA_00005    440    548    440    549 PF02852.17  Pyr_redox_dim     Domain     1   109   110    104.5   2.9e-30   1 No_clan  
+PROKKA_00006      4     72      4     73 PF13411.1   MerR_1            Family     1    68    69     44.0   1.3e-11   1 CL0123   
+PROKKA_00007      2     74      1     75 PF05052.7   MerE              Family     2    74    75    142.4     3e-42   1 No_clan  
+PROKKA_00008     13    218     11    219 PF00563.15  EAL               Domain     3   209   236    169.6   7.3e-50   1 No_clan  
+PROKKA_00009     26     78     26     87 PF13518.1   HTH_28            Domain     1    46    52     26.9   3.4e-06   1 CL0123   
+PROKKA_00009    173    313    172    313 PF00665.21  rve               Domain     2   120   120     85.4   2.9e-24   1 CL0219   
+PROKKA_00010     75    212     73    213 PF13610.1   DDE_Tnp_IS240     Domain     4   139   140    170.7   1.6e-50   1 CL0219   
+PROKKA_00011      3     40      3     41 PF00126.22  HTH_1             Domain     1    38    60     48.8   4.1e-13   1 CL0123   
+PROKKA_00013      2    366      1    366 PF03616.9   Glt_symporter     Family     2   368   368    544.8  6.1e-164   1 CL0064   
+PROKKA_00014      1     76      1     77 PF03992.11  ABM               Domain     1    77    78     48.0   8.8e-13   1 CL0032   
+PROKKA_00015     78     98     74    115 PF01842.20  ACT               Domain     9    29    66     20.5   0.00024   1 CL0070   
+PROKKA_00016      5     55      1     55 PF12840.2   HTH_20            Domain    11    61    61     35.6   5.6e-09   1 CL0123   
+PROKKA_00017     10     55      9     55 PF00440.18  TetR_N            Domain     2    47    47     53.5   1.3e-14   1 CL0123   
+PROKKA_00017     68    201     68    201 PF02909.12  TetR_C            Domain     1   139   139    165.1   6.5e-49   1 CL0174   
b
diff -r 000000000000 -r 68a3648c7d91 pfam_search/annota.Filter.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_search/annota.Filter.pl Thu Dec 22 04:45:31 2016 -0500
[
@@ -0,0 +1,206 @@
+#!/usr/bin/perl -w
+
+use strict;
+# Tab-separated Pfam-A table used to translate family accessions into text.
+my $d_file="/home/inmare/galaxy/tools/pfam_search/pfamA.txt";
+my %decode=();      # column 2 of pfamA.txt -> "<column 4><br>" plus long text fields
+my %clan_decode;    # column 2 of clans.txt -> concatenated long text fields
+my $id="";          # id of the FASTA record currently being read
+my %c=();           # sequence id -> concatenated sequence
+
+# Arguments: protein FASTA, pfam_scan table, output HTML file, then the
+# search keywords (which may contain the AND / OR operators handled by match).
+my ($prot_file,$pfam_file,$prefix,@search_T)=@ARGV;
+die "Usage: $0 <proteins.fasta> <pfam_scan.table> <output.html> [keywords ...]\n"
+    unless defined $prefix;
+my $searchP="";
+
+open(my $pfam_fh, '<', $d_file) or die "Cannot open '$d_file': $!\n";
+while (my $line = <$pfam_fh>)
+{
+    # Only rows starting with a digit (the numeric first column) are records.
+    next unless $line =~ /^\d/;
+    my @vl = split(/\t+/, $line);
+    # A record needs at least the key (column 2) and the description (column 4).
+    next unless defined $vl[1] && defined $vl[3];
+
+    $decode{$vl[1]} = "$vl[3]<br>";
+    my $cc = 0;
+    my %repeated = ();
+    foreach my $v (@vl)
+    {
+        # Drop literature markers such as "[1]"; this edits @vl in place.
+        $v =~ s/\[\d+\]/ /g;
+        $cc++;
+        # Stop at the build command line, and never look past column 10.
+        last if $v =~ /hmmbuild/;
+        last if $cc > 10;
+        # Skip fields that look like author / curator names.
+        next if $v =~ /anon/;
+        next if $v =~ /Bates/;
+        next if $v =~ /Cogis/;
+        next if $v =~ /Bateman/;
+        next if $v =~ /Sonnhammer/;
+        next if $v =~ /Finn/;
+        next if $v =~ /Studholme/;
+        next if $v =~ /Kerrison/;
+        # Skip the description itself (already stored) and repeated fields.
+        next if $v eq $vl[3];
+        next if $repeated{$v};
+        # Only long fields (20+ characters) are treated as descriptive text.
+        $decode{$vl[1]} .= "$v " if length($v) >= 20;
+        $repeated{$v}++;
+    }
+}
+close($pfam_fh);
+
+# Tab-separated Pfam clan table; fills %clan_decode, keyed by column 2.
+my $clan_file="/home/inmare/galaxy/tools/pfam_search/clans.txt";
+open(my $clan_fh, '<', $clan_file) or die "Cannot open '$clan_file': $!\n";
+while (my $line = <$clan_fh>)
+{
+    my @vl = split(/\t/, $line);
+    # Rows without a second column have no key to store text under.
+    next unless defined $vl[1];
+    my $cc = 0;
+    foreach my $v (@vl)
+    {
+        $cc++;
+        # Drop literature markers such as "[1]".
+        $v =~ s/\[\d+\]/ /g;
+        # Keep only long fields (30+ characters) among the first ten columns.
+        $clan_decode{$vl[1]} .= "$v " if length($v) >= 30 && $cc <= 10;
+    }
+}
+close($clan_fh);
+# Read the protein FASTA into %c: the key is the first whitespace-delimited
+# word of each header line, the value is the sequence joined onto one line.
+open(my $prot_fh, '<', $prot_file) or die "Cannot open protein file '$prot_file': $!\n";
+while (my $line = <$prot_fh>)
+{
+    if ($line =~ /^>(.*)/)
+    {
+        $id = (split(/\s+/, $1))[0];
+        # A bare ">" header yields no word at all; fall back to the empty id.
+        $id = "" unless defined $id;
+    }
+    else
+    {
+        # Strip the line terminator, including a CR from DOS-formatted files.
+        $line =~ s/[\r\n]+\z//;
+        $c{$id} .= $line;
+    }
+}
+close($prot_fh);
+
+# Rebuild the keyword query as one string, each argument followed by a space.
+foreach my $s (@search_T)
+{
+    $searchP.="$s ";
+}
+
+# Write the HTML report: one header + sequence cell per matching protein,
+# followed by one cell per Pfam family hit and one per clan hit.
+open(my $out_fh, '>', $prefix) or die "Cannot write '$prefix': $!\n";
+print $out_fh "<html>\n<head></head>\n<body>\n";
+# Emitted verbatim as the bgcolor attribute of highlighted cells.
+# NOTE(review): "#czb9dz" is not a valid hex colour - confirm the intended value.
+my $color="\"#czb9dz\"";
+my %printed;        # protein ids whose header/sequence cell was already written
+open(my $scan_fh, '<', $pfam_file) or die "Cannot open pfam_scan table '$pfam_file': $!\n";
+print $out_fh "Proteins with PFAM domains matching the keywords:\n<br><br>\n";
+print $out_fh "<div>\n<table cellpadding=\"0\" width=650>\n";
+my $ntokens=0;      # parity decides whether the next annotation cell is shaded
+while (my $line = <$scan_fh>)
+{
+    next if $line =~ /^\#/;
+    # Columns used: sequence id (1st), hmm accession (6th), clan (last).
+    my ($name,$domain,$clan) = (split(/\s+/, $line))[0,5,-1];
+    next unless $name;
+    next unless $domain;
+    # Remove the version suffix of the accession (e.g. ".18").
+    $domain =~ s/\.\d+//;
+    # Unknown accessions get placeholder text so that match() still receives
+    # a defined string to search.
+    my $sd = $decode{$domain} ? $decode{$domain} : "MagnottaPantaleo§§";
+    my $sc = $clan_decode{$clan} ? $clan_decode{$clan} : "SciarrattaCalogero@@";
+    next unless match($searchP,$sd,$sc);
+
+    unless ($printed{$name})
+    {
+        # First hit for this protein: print its name and wrapped sequence.
+        my $seq = defined $c{$name} ? $c{$name} : "";
+        $seq =~ s/\*//g;            # drop stop-codon markers
+        $seq = form($seq,90);
+        print $out_fh "<td>\n";
+        print $out_fh "<HR SIZE=3 WIDTH=80%>\n";
+        print $out_fh "<center><b>$name</b><br>\n</center>\n";
+        print $out_fh "</td>\n<tr></tr>\n";
+        print $out_fh "<td bgcolor=$color>\n";
+        print $out_fh "<pre> \n$seq\n </pre>\n";
+        print $out_fh "</td>\n<tr></tr>\n<td></td>\n<tr></tr>\n";
+        # Restart the shading so the first annotation cell is unshaded.
+        $ntokens=2;
+    }
+
+    my $hd = uc $domain;
+    if ($decode{$domain})
+    {
+        my $ddes = $decode{$domain};
+        print $out_fh ($ntokens % 2 == 0 ? "<td>\n" : "<td bgcolor=$color>\n");
+        print $out_fh "<p align=\"left\">\n";
+        print $out_fh "<a href=http://pfam.xfam.org/family/$hd> $domain</a>\n<p align=\"justify\">$ddes</p>\n\n";
+        print $out_fh "</p>\n</td>\n<tr></tr>\n<td></td>\n<tr></tr>\n";
+        $ntokens++;
+    }
+    if ($clan_decode{$clan})
+    {
+        my $clanD = $clan_decode{$clan};
+        print $out_fh ($ntokens % 2 == 0 ? "<td>\n" : "<td bgcolor=$color>\n");
+        print $out_fh "<p align=\"left\">\n";
+        print $out_fh "<a href=http://pfam.xfam.org/clan/$clan> $clan</a>\n <p align=\"justify\">$clanD</p>\n\n";
+        print $out_fh "</p>\n</td>\n<tr></tr>\n<td></td>\n<tr></tr>\n";
+        $ntokens++;
+    }
+    $printed{$name}=1;
+}
+close($scan_fh);
+
+# Disabled: listing of the proteins that have no PFAM domain.
+#print $out_fh "<br><br>Proteins without PFAM domains:\n<br>\n";
+#foreach my $seq (keys %c)
+#{
+#        next if $printed{$seq};
+#        print $out_fh "<>$seq</pre>\n\n<br><br><left>\n$c{$seq}</left><br>\n";
+# print $out_fh "<HR SIZE=3 WIDTH=80%>\n";
+#}
+print $out_fh "</table>\n</div>\n</body>\n</html>\n";
+close($out_fh) or die "Error while closing '$prefix': $!\n";
+
+# form($string, $len)
+#
+# Wrap $string into lines of $len characters for display.
+#
+#   $string  text to wrap (an undefined value is treated as empty)
+#   $len     number of characters per line; values below 1 disable wrapping
+#
+# Returns the wrapped text. A newline follows every complete group of $len
+# characters, one more newline always terminates the result, and all blanks
+# are removed after wrapping (so blanks still count towards the line width).
+# Every character of the input is kept; the previous index loop stopped one
+# element early and silently dropped the last character.
+sub form
+{
+    my ($string, $len) = @_;
+    $string = "" unless defined $string;
+    my $total = length $string;
+    $len = $total unless defined $len && $len >= 1;
+
+    my $outS = "";
+    for (my $pos = 0; $pos < $total; $pos += $len)
+    {
+        my $chunk = substr($string, $pos, $len);
+        $outS .= $chunk;
+        # Only complete groups are followed by a line break.
+        $outS .= "\n" if length($chunk) == $len;
+    }
+    $outS .= "\n";
+    $outS =~ s/ //g;
+    return $outS;
+}
+
+# match($terms, $d1, $d2)
+#
+# Decide whether a keyword query matches one of two description texts.
+#
+#   $terms  query string: whitespace-separated keywords, optionally combined
+#           with the upper-case operators AND and OR. AND is the default
+#           between keywords; OR separates alternative keyword groups.
+#   $d1     first description (the caller passes the Pfam family text)
+#   $d2     second description (the caller passes the Pfam clan text)
+#
+# Returns 1 when, for at least one OR-group, every keyword of the group is
+# found (case-insensitively) in $d1, or every keyword is found in $d2;
+# returns 0 otherwise.
+#
+# Keywords are used as regular expressions, as before. A keyword that is not
+# a valid pattern is matched literally instead of aborting the script.
+sub match
+{
+    my ($terms, $d1, $d2) = @_;
+    return 0 unless defined $terms;
+    $d1 = "" unless defined $d1;
+    $d2 = "" unless defined $d2;
+
+    # Operators are recognised as whole words only, so keywords that merely
+    # contain "OR" or "AND" (e.g. "ORF") are left intact.
+    foreach my $group (split(/\bOR\b/, $terms))
+    {
+        $group =~ s/\bAND\b/ /g;
+        # split ' ' discards leading blanks, so no empty keyword is produced
+        # (an empty pattern would silently reuse the last successful regex).
+        my @words = split(' ', $group);
+        # A group without keywords cannot be satisfied.
+        next unless @words;
+
+        my $nm1 = 0;
+        my $nm2 = 0;
+        foreach my $w (@words)
+        {
+            my $re = eval { qr/$w/i };
+            $re = qr/\Q$w\E/i unless defined $re;
+            $nm1++ if $d1 =~ $re;
+            $nm2++ if $d2 =~ $re;
+        }
+        # One satisfied group is enough; the remaining ones need no check.
+        return 1 if $nm1 == @words || $nm2 == @words;
+    }
+    return 0;
+}
b
diff -r 000000000000 -r 68a3648c7d91 pfam_search/clans.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_search/clans.txt Thu Dec 22 04:45:31 2016 -0500
[
b"@@ -0,0 +1,515 @@\n+1\tCL0001\tEGF\t\\N\tEGF superfamily\tFinn RD, Bateman A\tanon\tMembers of this clan all belong to the EGF superfamily.  This particular superfamily is characterised as having least 6 cysteines residues.\\\t\\\t\\\t\\\t   These cysteine form disulphide bonds, in the order 1-3, 2-4, 5-6, which are essential for the stability of  the EGF fold.    These disulphide bonds are stacked in a ladder-like arrangement.  The Laminin EGF family is distinguished by having an an additional disulphide bond.  The function of the domains within  this family remains unclear, but they are though to largely  perform a structural role.  More often than not, there domains are arranged a tandem repeats in extracellular proteins.\t2008-09-03 15:50:29\t2004-03-17 16:02:08\t26\t325\t6259\t696\t88541\t1\n+3\tCL0003\tSAM\t\\N\tSterile Alpha Motif (SAM) domain\tFinn RD\tanon\tSAM domains are found in a diverse set of proteins, which include  scaffolding proteins, transcription regulators, translational  regulators tyrosine kinases and serine/threonine kinases [1-3].   SAM domains are found in all eukaryotes and some bacteria [3] .   Structures of SAM domains reveal a common five helical structure.  The SAM domain is involved in a variety of functions. The most widespread function is in domain-domain interactions.   The SAM domain performs domain-domain interactions using multifarious  arrangements of the SAM domain.  More recently, the SAM domain within  the Smaug protein has been demonstrated to bind to the Nanos 3' UTR translation control element (Rfam:RF00161) [3]. 
This clan currently only represents the diverse SAM domain family and does not contain the more divergent SAM/Pointed family (Pfam:PF02198).\t2008-09-03 15:50:29\t2004-03-17 16:21:50\t20\t126\t742\t467\t11010\t1\n+4\tCL0004\tConcanavalin\t\\N\tConcanavalin-like lectin/glucanase superfamily\tBateman A\tanon\tThis superfamily includes a diverse range of carbohydrate binding domains and glycosyl hydrolase enzymes that share a common structure.\t2008-09-03 15:50:29\t2004-03-17 16:44:11\t19\t1631\t2750\t3131\t34755\t1\n+5\tCL0005\tKazal\t\\N\tKazal like domain\tFinn RD\tanon\tKazal domains are found in both serine protease inhibitors and extracellular regions of agrins. The structure of the Kazal domain is a small alpha/beta fold. Typically the Kazal domain consists of 2 short-helices and a  3-stranded anti-parallel sheet.  The fold is contains several disulphide bonds. \t2008-09-03 15:50:29\t2004-03-17 17:00:11\t26\t106\t337\t450\t6552\t1\n+6\tCL0006\tC1\t\\N\tProtein kinase C, C1 domain\tFinn RD\tanon\tThe members of this clan are all variations of the protein kinase C1 domain that is characterised by a rich cysteine and histidine content.  The C1 domain is the N-terminal region of conservation found in protein kinase C domains.  This domain is involved in binding many ligands, which include diacylglycerol, phorbol  esters and zinc [1].\t2008-09-03 15:50:29\t2004-03-17 17:47:56\t19\t30\t728\t396\t10495\t1\n+7\tCL0007\tKH\t\\N\tK-Homology (KH) domain Superfamily\tFinn RD\tanon\tThe KH domain is thought to be the second most prevalent RNA  binding motif in proteins.  The motif is characterised by a  conserved GXXXGXXG in the middle of the domain.  Structures of KH reveal that the KH domain is arranged as either a  beta-alpha-alpha-beta-beta (mini-KH domain) or  beta-alpha-alpha-beta-beta-alpha (maxi-KH domain).  The secondary elements are separated by at least four loop segments. 
The second loop is located between beta-1 and al  The KH domain can be found either as single or multiple copies.  The KH domain usually binds RNA as a multimer.\t2008-09-03 15:50:29\t2004-03-17 17:58:30\t17\t312\t491\t5344\t38636\t1\n+9\tCL0009\tENTH_VHS\t\\N\tENTH/ANTH/VHS superfamily\tBateman A, McMahon H\tanon\tThis clan includes the related ENTH and ANTH domains as well as the VHS domain. The ENTH domain is approximately 150 residues in length and is a solenoid of alpha-helices. The various ENTH domains have various lipid specificities but the key feature that distinguishes it functionally from ANTH domains is it"..b'phorylated tyrosine residues on its interacting protein-partner.\t2012-02-21 16:19:05\t2012-02-21 16:19:05\t1\t387\t421\t453\t9393\t1\n+542\tCL0542\tRAS_GEF_N\t\tRas guanyl-nucleotide exchange factor activity N-term\tCoggill P\tpcc\tThis is the more N-terminal domain of the RAS-GEF superfamily.\t2012-02-29 11:57:43\t2012-02-29 11:57:43\t1\t14\t157\t277\t2682\t1\n+543\tCL0543\tViral_gly_cn_dm\t\tViral glycoprotein central and dimerisation domains\tEberhardt R\tre3\tFlavivirus and alpha virus glycoprotein E/E1 consists of three domains. A dimerisation domain, a central domain and an immunoglobulin-like domain. The dimerisation and central domains are intertwined [1-3].\t2012-03-23 15:37:00\t2012-03-23 15:37:00\t1\t93\t37\t200\t12815\t1\n+544\tCL0544\tAcylCoA_ox_dh_N\t\tAcyl-coenzyme A oxidase/dehydrogenase N-terminal\tEberhardt R\tre3\tAcyl-CoA dehydrogenases and acyl-coenzyme A oxidases consist of three domains. An N-terminal all alpha domain, a beta-barrel middle domain and a C-terminal catalytic domain [1-2].\t2012-03-27 08:17:33\t2012-03-27 09:17:33\t1\t223\t112\t3478\t26792\t1\n+545\tCL0545\tAPCOP-app_sub\t\tClathrin (AP) and COPI appendage platform  subdomain\tCoggill P\tpcc\tThis superfamily is characterised by subdomains from the clathrin and coatomer appendages. 
The superfamily possesses a single protein/protein interaction site that in yeast binds to the ARFGAP Glo3p, and in mammalian gamma-COP binds to a Glo3p orthologue, ARFGAP2 [1].\t2012-04-18 13:17:34\t2012-04-18 14:17:34\t1\t0\t11\t303\t446\t1\n+546\tCL0546\tHexosaminidase\t\tbeta-N-acetylhexosaminidase-like domain\tCoggill P\tpcc\tThis superfamily is characterised by a mixed beta sheet with connection over the free side of the sheet. The fold is like a zincin fold lacking the catalytic centre.\t2012-04-30 14:53:24\t2012-04-30 15:53:24\t1\t124\t150\t1212\t3278\t1\n+547\tCL0547\tGF_recep_C-rich\t\tGrowth factor receptor Cys-rich\tEberhardt R\tre3\tThe cysteine-rich regions of growth factor receptor tyrosine kinases consist of eight disulphide-linked modules [1].\t2012-05-01 09:19:56\t2012-05-01 10:19:56\t1\t49\t88\t140\t2125\t1\n+548\tCL0548\tIHF-likeDNA-bdg\t\tIHF-like DNA-binding protein supewrfamily\tCoggill P\tpcc\tThis superfamily is characterised by being a dimer of identical subunits of a core of four helices in a bundle, partly opened, capped with a beta-sheet. All members appear to be prokaryotic DNA-binding domains.\t2012-05-01 16:00:04\t2012-05-01 17:00:04\t1\t60\t13\t4459\t12213\t1\n+549\tCL0549\tNicO_HupE_DsbD\t\tNicO/HupE/DsdD superfamily\tEberhardt R\tre3\tThis clan contains the nickel transpot family NicO [1-2] and the HupE/UreJ proteins, which may be involved in nickel binding. 
NicO and HupE contain a conserved GxxHxxxDH motif, which may bind to nickel.\t2012-05-31 10:50:16\t2012-05-31 11:50:16\t1\t0\t55\t3953\t10451\t1\n+550\tCL0550\tSRCR\t\tSRCR-like\tEberhardt R\tre3\t\t2012-07-11 13:20:22\t2012-07-11 14:20:22\t1\t19\t402\t139\t9451\t1\n+551\tCL0551\tBCLiA\t\tBcl-2 inhibitors of programmed cell death\tCoggill P\tpcc\tThis superfamily is characterised by families of proteins that inhibit apoptosis, They are regulated by all BH3-only proteins to promote apoptosis.\t2012-08-31 14:55:14\t2012-08-31 15:55:14\t1\t151\t28\t348\t1402\t1\n+552\tCL0552\tHect\t\tHect, E3 ligase catalytic domain\tCoggill P\tpcc\tThis superfamily is characterised by fmailies with E3-ligase catalytic acitvity.  The fold consists of two alpha+beta domains; where the N-terminal domain is an array of helices and beta-hairpins; the C-terminal domain is an a/b sandwich with one left-handed beta-alpha(n)-beta unit.\t2012-09-07 15:42:50\t2012-09-07 16:42:50\t1\t23\t249\t342\t4684\t1\n+553\tCL0553\tHBMR\t\tHelical backbone metal receptor superfamily\tCoggill P\tpcc\tThis superfamily is characterised by a long alpha helical insertion in the interdomain linker in the Chelatase-like fold. Representative families include the periplasmic ferric siderophore binding proteins, the TroA-like nitrogenase iron-molybdenum proteins and the putative iron(III) transporter family, TM0189-like, or periplasmic-binding family.\t2012-09-12 10:08:39\t2012-09-12 11:08:39\t1\t190\t51\t4828\t19772\t1\n'
b
diff -r 000000000000 -r 68a3648c7d91 pfam_search/f1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_search/f1 Thu Dec 22 04:45:31 2016 -0500
b
b'@@ -0,0 +1,504 @@\n+\n+1\n+# pfam_scan.pl, run at Fri Jul 3 12:01:47 2015\n+#\n+# Copyright (c) 2009 Genome Research Ltd\n+# Freely distributed under the GNU\n+# General Public License\n+#\n+# Authors: Jaina Mistry (jaina@ebi.ac.uk),\n+# Rob Finn (rdf@ebi.ac.uk)\n+#\n+# This is free software; you can redistribute it and/or modify it under\n+# the terms of the GNU General Public License as published by the Free Software\n+# Foundation; either version 2 of the License, or (at your option) any later version.\n+# This program is distributed in the hope that it will be useful, but WITHOUT\n+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS\n+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more\n+# details.\n+#\n+# You should have received a copy of the GNU General Public License along with\n+# this program. If not, see <http://www.gnu.org/licenses/>.\n+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =\n+# query sequence file: /home/inmare/galaxy/database/files/000/dataset_578.dat\n+# cpu number specified: 2\n+# searching against: /home/inmare/galaxy/tools/pfamScan/hmm/Pfam-A.hmm, with cut off --cut_ga\n+# resolve clan overlaps: on\n+# predict active sites: off\n+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =\n+#\n+# <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan>\n+10FR_NODE_6#PROKKA_00001 25 131 23 132 PF14691.1 Fer4_20 Domain 3 110 111 91.1 3.2e-26 1 CL0344\n+10FR_NODE_6#PROKKA_00002 28 394 28 394 PF00310.16 GATase_2 Domain 1 361 361 446.4 5.4e-134 1 CL0052\n+10FR_NODE_6#PROKKA_00002 471 755 468 755 PF04898.9 Glu_syn_central Domain 4 288 288 358.5 2.3e-107 1 CL0036\n+10FR_NODE_6#PROKKA_00002 815 1178 815 1178 PF01645.12 Glu_synthase Family 1 368 368 503.8 2.4e-151 1 CL0036\n+10FR_NODE_6#PROKKA_00002 1259 1450 1256 1452 
PF01493.14 GXGXG Family 4 200 202 228.0 5.3e-68 1 No_clan\n+10FR_NODE_6#PROKKA_00003 6 81 6 82 PF13740.1 ACT_6 Domain 1 75 76 81.8 2.1e-23 1 CL0070\n+10FR_NODE_6#PROKKA_00003 196 364 196 364 PF12710.2 HAD Family 1 192 192 66.9 2.5e-18 1 CL0137\n+10FR_NODE_6#PROKKA_00004 95 263 94 264 PF02222.17 ATP-grasp Family 2 171 172 138.3 1.5e-40 1 CL0179\n+10FR_NODE_6#PROKKA_00005 5 153 4 154 PF00731.15 AIRC Domain 2 149 150 217.4 4.8e-65 1 No_clan\n+10FR_NODE_6#PROKKA_00006 98 355 93 355 PF02353.15 CMAS Family 5 273 273 296.5 1.4e-88 1 CL0063\n+10FR_NODE_6#PROKKA_00007 44 78 43 79 PF07676.7 PD40 Repeat 2 38 39 35.2 6.5e-09 1 CL0186\n+10FR_NODE_6#PROKKA_00007 99 131 98 135 PF07676.7 PD40 Repeat 2 33 39 14.8 0.017 1 CL0186\n+10FR_NODE_6#PROKKA_00007 344 368 343 372 PF07676.7 PD40 Repeat 7 33 39 19.4 0.00057 1 CL0186\n+10FR_NODE_6#PROKKA_00007 391 409 375 412 PF07676.7 PD40 Repeat 8 26 39 12.3 0.098 1 CL0186\n+10FR_NODE_6#PROKKA_00007 481 501 473 505 PF07676.7 PD40 Repeat 11 33 39 12.2 0.1 1 CL0186\n+10FR_NODE_6#PROKKA_00007 681 748 680 749 PF14684.1 Tricorn_C1 Domain 2 69 70 76.4 1.1e-21 1 No_clan\n+10FR_NODE_6#PROKKA_00007 761 851 761 852 PF14685.1 Tricorn_PDZ Domain 1 87 88 80.6 5.1e-23 1 CL0466\n+10FR_NODE_6#PROKKA_00007 881 1036 881 1036 PF03572.13 Peptidase_S41 Family 1 169 169 68.4 4.5e-19 1 CL0127\n+10FR_NODE_6#PROKKA_00008 4 214 4 225 PF02811.14 PHP Family 1 170 175 43.3 3.2e-11 1 CL0034\n+10FR_NODE_6#PROKKA_00009 1 48 1 48 PF13793.1 Pribosyltran_N Domain 70 116 116 63.1 1.8e-17 1 No_clan\n+10FR_NODE_6#PROKKA_00009 130 237 120 238 PF14572.1 Pribosyl_synth Domain 74 183 184 84.5 7.3e-24 1 CL0533\n+10FR_NODE_6#PROKKA_00010 6 90 6 90 PF01386.14 Ribosomal_L25p Domain 1 88 88 84.9 2.9e-24 1 No_clan\n+10FR_NODE_6#PROKKA_00010 98 180 97 183 PF14693.1 Ribosomal_TL5_C Domain 2 85 88 63.7 1.3e-17 1 No_clan\n+10FR_NODE_6#PROKKA_00011 4 185 4 185 PF01195.14 Pept_tRNA_hydro Domain 1 184 184 190.2 2.3e-56 1 No_clan\n+10FR_NODE_6#PROKKA_00013 53 132 53 132 PF08545.5 ACP_syn_III 
Domain 1 80 80 102.2 8.2e-30 1 CL0046\n+10FR_NODE_6#PROKKA_00013 '..b'9 21 98 20 105 PF00005.22 ABC_tran Domain 2 78 137 63.3 2.7e-17 1 CL0023\n+unf328_NODE_30#PROKKA_00360 32 350 30 381 PF13458.1 Peripla_BP_6 Family 3 308 343 154.0 6.2e-45 1 CL0144\n+unf328_NODE_30#PROKKA_00361 1 193 1 194 PF01266.19 DAO Domain 170 357 358 57.2 1.3e-15 1 CL0063\n+unf339_NODE_31#PROKKA_00362 2 267 1 267 PF07592.6 DDE_Tnp_ISAZ013 Domain 46 311 311 409.3 6.7e-123 1 CL0219\n+unf339_NODE_31#PROKKA_00364 12 221 9 249 PF00378.15 ECH Family 5 215 245 179.9 4.3e-53 1 CL0127\n+unf350_NODE_32#PROKKA_00367 3 227 3 228 PF01370.16 Epimerase Family 1 235 236 137.7 3.9e-40 1 CL0063\n+unf350_NODE_32#PROKKA_00369 25 104 5 105 PF01266.19 DAO Domain 279 357 358 56.4 2.1e-15 1 CL0063\n+unf361_NODE_33#PROKKA_00371 108 224 64 238 PF01695.12 IstB_IS21 Family 48 162 178 54.1 1.2e-14 1 CL0023\n+unf383_NODE_35#PROKKA_00375 12 72 10 72 PF00392.16 GntR Family 3 64 64 64.5 4.2e-18 1 CL0123\n+unf383_NODE_35#PROKKA_00375 82 201 82 203 PF07729.7 FCD Domain 1 123 125 92.2 2.9e-26 1 CL0388\n+unf866_NODE_7#PROKKA_00376 25 84 24 100 PF01926.18 MMR_HSR1 Family 2 57 116 56.4 2.5e-15 1 CL0023\n+unf866_NODE_7#PROKKA_00377 5 219 2 221 PF01902.12 ATP_bind_4 Family 4 216 219 107.8 4.7e-31 1 CL0039\n+unf866_NODE_7#PROKKA_00378 7 184 6 185 PF02737.13 3HCDH_N Domain 2 179 180 226.0 2.4e-67 1 CL0063\n+unf866_NODE_7#PROKKA_00378 187 283 187 283 PF00725.17 3HCDH Domain 1 97 97 118.2 1.5e-34 1 CL0106\n+unf866_NODE_7#PROKKA_00379 6 198 3 242 PF00753.22 Lactamase_B Domain 4 157 194 65.4 5.2e-18 1 CL0381\n+unf866_NODE_7#PROKKA_00382 14 293 12 294 PF01180.16 DHO_dh Domain 3 293 295 240.8 1.4e-71 1 CL0036\n+unf866_NODE_7#PROKKA_00383 36 254 36 257 PF02358.11 Trehalose_PPase Family 1 232 235 87.8 4.8e-25 1 CL0137\n+unf866_NODE_7#PROKKA_00384 5 469 4 470 PF00982.16 Glyco_transf_20 Family 2 473 474 483.7 4.5e-145 1 CL0113\n+unf866_NODE_7#PROKKA_00385 5 108 4 109 PF01740.16 STAS Domain 2 116 117 48.3 5.4e-13 1 
CL0502\n+unf866_NODE_7#PROKKA_00386 14 137 13 137 PF13581.1 HATPase_c_2 Domain 2 125 125 101.7 2.2e-29 1 CL0025\n+unf866_NODE_7#PROKKA_00387 204 305 204 305 PF03448.12 MgtE_N Domain 1 102 102 89.0 2e-25 1 CL0436\n+unf866_NODE_7#PROKKA_00387 311 365 307 368 PF00571.23 CBS Domain 5 54 57 16.9 0.004 1 No_clan\n+unf866_NODE_7#PROKKA_00387 378 425 371 427 PF00571.23 CBS Domain 8 55 57 30.6 2e-07 1 No_clan\n+unf866_NODE_7#PROKKA_00388 35 387 32 387 PF01566.13 Nramp Family 4 358 358 286.2 2.8e-85 1 CL0062\n+unf866_NODE_7#PROKKA_00390 19 154 18 156 PF08327.6 AHSA1 Family 2 122 124 57.6 1.2e-15 1 CL0209\n+unf866_NODE_7#PROKKA_00391 9 68 9 68 PF12840.2 HTH_20 Domain 1 61 61 59.1 2.7e-16 1 CL0123\n+unf866_NODE_7#PROKKA_00393 12 427 10 427 PF00275.15 EPSP_synthase Family 3 419 419 342.8 1.9e-102 1 CL0290\n+unf866_NODE_7#PROKKA_00394 11 236 9 236 PF13505.1 OMP_b-brl Domain 3 176 176 51.2 1.3e-13 1 CL0193\n+unf866_NODE_7#PROKKA_00395 55 188 54 188 PF13492.1 GAF_3 Domain 2 129 129 64.5 9.6e-18 1 CL0161\n+unf866_NODE_7#PROKKA_00395 217 352 217 352 PF13492.1 GAF_3 Domain 1 129 129 71.3 7.3e-20 1 CL0161\n+unf866_NODE_7#PROKKA_00395 415 602 414 603 PF07228.7 SpoIIE Family 2 192 193 132.6 1.3e-38 1 CL0238\n+unf866_NODE_7#PROKKA_00396 20 134 14 135 PF00156.22 Pribosyltran Domain 8 124 125 60.3 1.4e-16 1 CL0533\n+unf866_NODE_7#PROKKA_00397 4 242 2 243 PF01790.13 LGT Family 5 255 256 151.8 1.6e-44 1 No_clan\n+unf866_NODE_7#PROKKA_00399 51 235 51 236 PF01612.15 DNA_pol_A_exo1 Domain 1 175 176 111.4 3e-32 1 CL0219\n+unf866_NODE_7#PROKKA_00399 267 646 265 646 PF00476.15 DNA_pol_A Family 3 383 383 431.2 2.5e-129 1 No_clan\n+unf866_NODE_7#PROKKA_00400 9 383 9 384 PF01053.15 Cys_Met_Meta_PP Domain 1 385 386 438.0 1.8e-131 1 CL0061\n+unf866_NODE_7#PROKKA_00401 253 312 233 312 PF13517.1 VCBS Repeat 1 61 61 34.1 2.6e-08 1 CL0186\n+unf866_NODE_7#PROKKA_00402 41 209 38 209 PF02572.10 CobA_CobO_BtuR Family 4 172 172 136.4 6.9e-40 1 CL0023\n+unf866_NODE_7#PROKKA_00403 78 252 71 256 PF13535.1 
ATP-grasp_4 Domain 9 180 184 35.9 5.8e-09 1 CL0179\n+unf866_NODE_7#PROKKA_00404 58 131 56 132 PF03061.17 4HBT Domain 3 78 79 38.8 7.3e-10 1 CL0050\n'
b
diff -r 000000000000 -r 68a3648c7d91 pfam_search/f2
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_search/f2 Thu Dec 22 04:45:31 2016 -0500
b
b'@@ -0,0 +1,3012 @@\n+>10FR_NODE_6#PROKKA_00001\n+MGDPKGFMTVPRKEASYRPRNERIYDFGEVEQTLNEEDRKLQTSRCMDCGVPFCHWGCPV\n+GSKIPEWQDAYYRGNEEVAYEILHSTNSFPEITGRICPAPCEKSCVLSINEEPVTIRENE\n+AANVENAFTQGWIKANPPSIRVV*\n+\n+>10FR_NODE_6#PROKKA_00002\n+MEKKRKEMAFRFPKPDGLYNPANEHDNCGIGFVAHIKGEASHDIVERGLEVLRNLDHRGA\n+KGSDNASGDGAGVMVQIPHEFIKKVLKIDVPAKGSYGTGLIFLPQLEAEANACVDILSNI\n+IQEEGLQLIGYRDVPTDSSIPGEIARTTEPRIKQVFIKANLEEDILEQKLYIVRKRAEKA\n+VQASDLSQKEVYYHSSLSAKTMIYKGMLTPDQMKDYFTDLQHPLFKSALILIHSRFSTNT\n+FPTWDLAQPFRLVAHNGEINTIKGNRLWTQAREGLLKSEVFGDDLPKILPVLEEGKSDSA\n+SFDNVLEFLHRTGRSLHHSLCMMIPESFNEKNPIPESLKAFYEYHSTIMEPWDGPASIVF\n+SDGRYIGGTLDRNGLRPSRYVITKNDLIVMASETGVQDFAAEEILEKGRLRPGKILLVDT\n+RLGIIIPDEEVKEQLSRRNPYGMWLKENRLLMEDIKVRQRVPSTMDDFLTYAKVFSYSKE\n+DMEFLIQSMSNTAVEPINSMGNDTPAAIFSRQPQRLFNYFKQTFAQVTNPPIDAIREGLV\n+MSLTNYIGSLNSNILKESPDHCKLIKFPDPIVTNTDLGKIKDLKDEMFSHEIISIVFPVD\n+QGFEGFKKAFDEMLERAEKAVDDKKNFIILSDRAIDSKHAPFPSLLAVSAVHHHLIQKKK\n+RMQVGIAVETGEAREVNHYALLLGYGASVINPYLAFAAVDHLVKEGKLDMEYKDARRNYI\n+KSIKKGLLKIFSKMGISTVRSYHGAQIFGAFGLSKELVDKYFKGTSSPISGIGLEEIYEE\n+YSQFHKDAFREEATQEKFRFETTGVYAWRKNREDHAWNPDSIGLLQWATRTNSYEKFKEY\n+SRTVDEYNRKPSFIRGCFQVKRNSISIEEVEPVEEIMKRFVTGAMSYGSISKEAHESLAV\n+AMNTVGGRSNTGEGGEDHNRFGTEKQSAIKQIASGRFGVTSNYLTNAREIQIKIAQGAKP\n+GEGGQLPGYKVNEVIAKLRNSTPGITLISPPPHHDIYSIEDLAELIYDLKATNPKAKISV\n+KLVSQDGVGTVAAGVAKAFADLIIISGGEGGTGASPISSIKHAGLPVEIGIAEAQQTLVK\n+NNLRGRVKIQVDGQLKNGHDIVTMACLGAEEFGFATSALITLGCVMMRKCHLNTCPTGIA\n+TQDETLRERFTGNPQLVINFFRFLAGEVRELLAEMGFKKFDDIIGRADLLEENKEVFGWK\n+MKNVDFSAVLNRPAEADKFDIRYVPGSASLNLDGHLDHTLIEESGKAIKGKEKVWLHHPF\n+ANTDRAIGAMLSGVISQKYGEFGLPEDTIHATFDGSAGQSFGAFLAKGVTFRLEGDSNDY\n+IGKGLSGGKIIVVPPTGSTFTPEENIIIGNSTFYGATGGEAYIQGVAGERFCVRNSGMEA\n+VIEGAGDHCCEYMTGGRVVVLGKTGRNFAAGMSGGIAYVLDEDGDFDFYCNKGLVELLEV\n+EDKKDIKELQGLISKHLTYTQSPKAAKILTQWEEYLPKFVKVIPYEYRKVLRERELRELE\n+QKMKMTEDANVMQE*\n+\n+>10FR_NODE_6#PROKKA_00003\n+MLEQKELILLNISGEDKPGLTASLTEILSQHNVNILDIGQSVIHKDLGLGILFEVPKKYR\n+SASILKDLLFKAYELKSHIKFTPIPIEEYEKWVAEQGKERFIITLLAHKLTALHLSKVSS\n+LIASQKLNIDTISRL
SGRKSLNGNNKVTNSVVEFSVRGTPLDINAMKQSLMNIASETGID\n+IAFQEDNIYRRSRRLVCFDMDSTLIQTEVIDELAQKAGVGDEVKKITESAMRGEIDFKES\n+FKKRVSLLKGLDESVMKGIAENLPITNGAERLLSTLKQYGYRTAILSGGFTYFGNYLKTK\n+LGFDYVFANELEIKNGKLTGKHLHEIVDGKRKAELLELLAFKEDIHLEQVIAVGDGANDL\n+PMLEKAGLGIAFHAKPKVKASAQHAISATGLDTILYLLGFRDREINAS*\n+\n+>10FR_NODE_6#PROKKA_00004\n+MLIQEASKWDIITYVLDNDETCPANSLATHFVKGSNLDFDSVYRFGKMVDLLTYEMENIN\n+IEALKKLKSEGHQIIPDPDILELIQDKGKQKEFYQDNNVPTAPFKIYSSRQDIVQAIKNG\n+EIKFPFVQKLRTGGYDGRGVAVISDENDLDKLLDGASIIEDKVNIAKEIAVIAARNKQGE\n+IKCFPVVEMVFDPEANLVDKLICPSKITAEQSEKAIEIAGKIIGLLGMQGLLAVEFFVDE\n+NGEVIVNESAPRPHNSGHHTIESIITSQFEQHLRAIFNLPLGSTRPKLPAVMVNILGGEG\n+YEGPVRYEGLTEIMAIEGVKIHLYGKKITRPFRKMGHITVLSDSLETALEKAEKVKQLIK\n+VKSWDKN*\n+\n+>10FR_NODE_6#PROKKA_00005\n+VGQKLVSIVMGSDSDLPVMKPAAEMLEQLGVEYEIDIVSAHRTPEKLFDFASNAHKRGIQ\n+VIIAGAGGAAHLPGMVASMSPLPVIGVPVKSSNSIDGWDSVLSILQMPGGVPVATVALNG\n+AKNAGILAAQIISVSDSQVREKIIEYKAGLKEAVMKKAKNLKG*\n+\n+>10FR_NODE_6#PROKKA_00006\n+MDPKTVIRELLTGTGVHLNGPHPYDVQVHDERAYERWLSEAELGLGESYMDGWWDCLALD\n+EFIERILRAGLEEKVKRNFSTAFYVLSKRLFNQQTRVKSKRVGREHYDLGNELFSKMLDR\n+RMVYSCGYWQRAKNIDQAQEAKLDLICKKLNLKPGMKVLDIGCGWGSFAKYAAEKYGVEV\n+LGVSISKRQIELGNELCKGLPVTLLYKDYRDVEGKFDAVVSVGFFEHVGYKNYDTYMKIV\n+DRCLTDNGISLLHTIGNNTTTHYVNRWTNKYIFPNGMLPSIAQVAKAAEPYFVIEDFHNF\n+GPDYDKTLMAWYDRFNKAWKELKNQYDERFYRMWRYYLLSSAGGFRSRATQLWQFVMTRT\n+GRQQPDCRFA*\n+\n+>10FR_NODE_6#PROKKA_00007\n+VAHAATTGNNETLLCRFPTLHNNTIVFEAGGNLWRVDRTGGVATRLTTDPGYDMMPRFSP\n+DGKTIAFTGQYSGNVDVYTIPADGGAVTRLTYHSDVVRKAPTRWGPDNMVMTWTPNGKDI\n+VFLSRRDTWNSWFGQPFEVSKMGGLPTHLPLPKGGVMSYSPDGSKIAYNRIFRNFRTWKR\n+YKGGLAQDIWIYDFKTKKIQRVTKWKGTDTYPMWYKNTIYFASDRGANHRLNIWAYSLDT\n+KTFRQITHFKNYDVDWPSLGNNGIVFQDGGSLYVLDLPSEQLHKINVKVPTDGTQTLPRW\n+INASKMIRSLDISPNGKRVLFGARGDIFTVPAKHGATRDITQTSDAQEQYPAWSPNGKWI\n+AYLTDASGVNELAIRPSDGSGHQTYITNAKTGYYYNPTWSPNSQMLAYSDNNHVLWYISL\n+KDKKPVRIAQDKYNAMRDYHWSPDNNWISYSKTNASGLSQIYIYSLADHKSYKVSDGIYS\n+DNDPVFGPNGKYLFFVSARHENPLFSESESNVATEKMDGIYMVTLQKNEKSPFAPVSDEG\n+MPEAKKASSSASKKTESAKDVKIDFNGLMNRVIMLPIKSGDYGNIQVTGNKVFYQTRPLI\n+TIE
GFLHGTGQSSIMVYDLKSKKGHTVVANGARTYGLSADGKTLVYMRRGKFFLMPSASV\n+NAKGSEPVNTSHMKMKIYP'..b'OKKA_00395\n+VYESRPQTTPEQHSSSIEGSHPTDAAAAAGSRSVELAQTDFLLRLTDALNTTLNLQTLLQ\n+RTADLVRTVIDYRIFAILLLDNRTNDLRMRFQTGHRPEVERMRIRLGQGVTGQVALTRKP\n+MLIPDVRDVENYINANPDVHSEIAIPLIVKNRVIGVIDIQSEQPNYFQPDHLHLLTLTAS\n+RIAHAIDNARLYTRVSRQAQTLEVLNEISRDLSSILDTDRLFERISQLLRRLFDYQMFTI\n+WTVRPIEHVLENRFALRFGERYYPNETIPVERGIVGAAIAERRPMNIPDVRRDPRYHKVN\n+EETRSELAVPLMYKSKVVGVLDVEHTQPHYFSEDHVRALTTLAAQIAIAIENAQLYQRVI\n+QQEQRLDHDLQMAREVQLRLLPPSLPSRPHAEFAARFLPARTIGGDLYDYLNYDDQRGAL\n+AIGDVSGKGAAAALYGAVVSGTMRSQASLKPSPAAMLQALNASLHERRLDSQYVAMLYTV\n+WNDENLTLQIANAGSVQPIYCRSGEIETVPVEGFPLGMFPQAEYEEISLSMRPGDSVFFF\n+SDGITDGENEAGESFEERLTDSIARHHHLPAEEAVNAVFEELQEFQGDCDRFDDETLIAL\n+RVV*\n+\n+>unf866_NODE_7#PROKKA_00396\n+MPDQTQFPPAESLRIRFTRDQIQSRVREMGRHIREDLHGESVVLVGVLKGAAIFLADLAR\n+AINLDCTFDFVAVSSYKSGTRSSGAVQLIKDLTEPIEGRHIIVVEDILDTGVTLSFLQSH\n+FERHNPRSIRVAALLDKPSRRIRPIQGHYIGFSIPNEFVVGYGMDYAERYRNLPDIRILD\n+SIPAS*\n+\n+>unf866_NODE_7#PROKKA_00397\n+MYPFIHIGHFTIPTYGIMMWLAAVAGCIVLYRNFKRWKVEGDAITIVAFATVIGIIGGKL\n+YHVLEKPVLLMHHPALLISRSGFAWYGGMIAGILALLFQAGTYNIRPLRMLDLCVPSAAL\n+GYGIGRLGCFFSGDGGYGPPTKMWFGMSFPHGTVPTTQKVYPTPLFEFVAAVIIFYILWR\n+RSRPAAERKLGHMTAEYLLLAGGARFLFEFIRINPKIFLGLSNAQWASIAEMLGGTALLW\n+WSRKYASTPQPGQQGRQPKEEPALVAAGDSGGPPTAEQTQ*\n+\n+>unf866_NODE_7#PROKKA_00398\n+VSACSHRPDARPLRIGQAQKDLRIPANKLHQEASPAGPQQILGRHLSQLPRRPRPAPPPE\n+DIEDNAGPDKLKNRRRIHFLCCRHNSVRIAHPEPHLRRNAVIPVSGQLASNAPDSVPQRR\n+RGHAQIQHPQGTNLIGPGLEQQRQNPGNHPAKPGKPGPADQQRWMMHQQHWLFQHMVELC\n+AHHSSHRRKGDDADRIGINLPALEVLVKEVAPDHRGKPHHDAVCANR*\n+\n+>unf866_NODE_7#PROKKA_00399\n+VRLAQDAPVSFDPDTFKVTPPDPAVITPVLAQLEFNQLLNQFAAPPPKADYRRLSDPEEI\n+EDFLKPVARKKRLAIDTETTSIQPMLAELVGVSLCHQAGEAVYIPVAHNLTPGQSQADKE\n+AVLQTLAPVLADPAVTKIGQNIKYDLIVLGRCGMEINGPLFDTMVASYLLNPGKTSHNLA\n+SIAAEFLGRSVISYQEATGGKNRPFADTDLDQATDYAAEDADVAWQAAQVLEKKLAESHL\n+DGLFRDLEMPLVPVLARMERNGVGLDVQGLEDLGKELAAKLDEIERTCYRLAGHEFNLNS\n+PKQLAQVLFEELGLTPVKKTKKGKTSSTDVSVLTVLAAKHPLPAEVLNYRTLSKLKSTYI\n+DTLPKLVNPQTGRLHTSFNQAV
TATGRLSSSDPNLQNIPVRSEIGERIRACFIAEKGNLL\n+VSADYSQIELRVLAHLSRDPLLVEDLTKGLDVHTQTAARLFDVMPELVTKPMRARAKTVN\n+FGILYGMSAFRLAREQGISRKEAQQIIDKYLGRYQGVARFQEENLRQAREKGYVTTLLGR\n+RRYLPAINAGDRLARQAAERMALNTPIQGTAADIIKLAMLAAHRLLEERFPQALMILQIH\n+DELLFEVPASQAEDLAQAVKQAMEGVIELAVPLVVDIGIGPDWAQAH*\n+\n+>unf866_NODE_7#PROKKA_00400\n+MDHGKHKLATRLIHAGEPQEPVEGAVTLPIFQSSTFVYQGQASYHDLKYIRLNNTPNHRA\n+LHHKLAALENAESALVTASGMAAISATLLALSKSGDHLLCQDCLYGGTLDLITKDLAALG\n+IGHDFLDPERPESWAELLKPTTRLIYVETISNPLMQVMDLEAVVGFAREHGLVSVIDNTF\n+ATPVNFRPAEWGFDLSLHSGTKYLNGHSDIVAGAVIGRAELVERITHKLNHLGGSLDPHA\n+CFLLHRGLKTLAVRMDWHNRSAQRLAEFLAGHPAVTRVNYPGLPDAPDHQRASHLLDGFS\n+GMLSFELRGGVAAAERFLSRVRLPYLAPSLGGVESLVTRPATTSHAGLSPQERQAAGISE\n+SLIRVSVGLEDPDDLVADFDQALA*\n+\n+>unf866_NODE_7#PROKKA_00401\n+MSAMGISRTVRRTVVLVAALSLLLSASAWAAPQRVAVLPFTANAKEDISYLVKGVRDMLA\n+SRLAWQDKVVVIEPDLVAPVMKEVPPPYNEAKARKLGNKLSADVVVYGSITALGSTVSVD\n+AQVIKVKGKQPPLSTFVQAADLNQVIPQINDFAKRINAEIFRRPEAVAALQKQGQQAEKG\n+KQEAGSSGKPLVEAPKTPAAEWQQKRAVEVGKLPPNISPLNPLFLRSLSGVDSDRYWRSP\n+RIDGVVSSLAVGDIDNDGRNEMVVLLHKRIRVYRLDGQRFGLIHEFKKGPDGEYLFVDIA\n+DLDGNGRPEIFVSSIINGEIVSFVLEWGEGGLGIKAKDIPWFFRVQPNPTGKGNIVWGQG\n+KSINAAFAGPVYRMKYENGQYVPGEPIRLPEYANVFNFVKADLNGSGRPMTVMVAPGFRL\n+KVFGKPDDELFASGEMYAGSSKFIEVPSHSDPSNPGDEPAREFLPTRLIVNDLDKDGRSE\n+IVVVRNKDSLQGIMENMQFFYQGTIYSLYWNGMSLLENWRTPRISGYLTDYTIADVGNVG\n+RPALVLSVVQTKYGGMVEKGFSHIVAFTLKPQAKKKKHYIKRTKGL*\n+\n+>unf866_NODE_7#PROKKA_00402\n+MTPLPRLTSRRGFPWPRKVHEEMERTYYQERSQHGHITGPGLVHVVYGQGVGKTSRCVGL\n+AVRAAGAGLKVAWVQFMKDDTSSEVKVLRDLAGVHYFCPGPHAFITKKGPGDEHRQHARR\n+ALEHARELVEKDRVQVLICDEILNTLLFKVLPLEEVMALVELCRGRVELVMSGADVHPDL\n+LDAADYATELVQRKHPYYRGIEARKGIEF*\n+\n+>unf866_NODE_7#PROKKA_00403\n+MPKPVISYHPALEADQSFLLRSRRPLERRDLLAILRAGAVLLPQAPRADLYLLVAGMGRP\n+HFPRAAVYFSLDGKVGNHRLFSALGLPQPPTLSFENLEQALAAWREGGLEAAGITPPLVV\n+KGAGGGEGSNVFLVRDIGELAGLAGRVETFCARGPSGLVVQKYLDGGGGDARVVLLGRSC\n+EAFWRRSAPGEFRSNLSSGGRVDRRWRPAELERAVEPARRPQGATGVVVAAVEILVPPGG\n+EPLLLELNFYFGRRALGGSETFLRRYLAAVRRWLEGLGLDPRRVQLYE*\n+\n+>unf866_NODE_7#PROKKA_00404\n+MTTPR
ETPLENPYSQSGCFFCGQDNPVGLKLRFARVEGKEELVCRWRPDRRYLGLGRVLH\n+GGIQCGLFDEIMGWTAHHFSQGPGVTQEVSVRYLAPLFIDRPLELRCRVVERKERRIFME\n+AEIRDHQGRVCSRARGSYALMDPERFARLVQDQPEPPPAE*\n+\n'
b
diff -r 000000000000 -r 68a3648c7d91 pfam_search/lipase
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_search/lipase Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,162 @@
+<html>
+<head></head>
+<body>
+Proteins with PFAM domains:
+<br><br>
+<div>
+<table cellpadding="0" width=650>
+<td>
+<HR SIZE=3 WIDTH=80%>
+<center><b>15FR_NODE_1#PROKKA_00113</b><br>
+</center>
+</td>
+<tr></tr>
+<td bgcolor="#czb9dz">
+<pre> 
+VLAGIAEWYGEDEAVPFRVISGTSAGAMNAAYLSANMENFAHGTQRLAQVWSQLEAQQVYRPEYRKVFGALLHWAWSLLSGGLGDSNPRS
+LLDNSPLRALLAENIDFDAIARNIERGLLRGVSVTVAGYSTERSLSYFQAETGVQSWWRQRREGRPVQMTLDHVMASLGLPIIFPAVKVA
+GEWCGDGSTREFAPLSPAIHLGAKRVLVIDTQYPAPQHVLGQDQAYPSLSKIMGYLFDSVFSDSLYADLERTKRINRTLDYIKRQSGHEP
+PELGLSHIDTLVIAPSRRPLEIASRYESHLPKSMRWILRSLGGDVSSGDQLLSYMLFQSGYCSEMVALGRHDAHARREEIGQFLGLSKIK
+V
+
+ </pre>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td>
+<p align="left">
+<a href=http://pfam.xfam.org/family/PF01734> PF01734</a>
+<p align="justify">Patatin-like phospholipase<br>Pfam-B_2206 (release 4.1) This family consists of various patatin glycoproteins from plants. The patatin protein accounts for up to 40% of  the total soluble protein in potato tubers  . Patatin is a storage protein but it also has the enzymatic activity of lipid acyl hydrolase, catalysing the cleavage of fatty acids from membrane lipids  . Members of this family have been found also in vertebrates. </p>
+
+</p>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td bgcolor="#czb9dz">
+<p align="left">
+<a href=http://pfam.xfam.org/clan/CL0323> CL0323</a>
+ <p align="justify">Patatin/FabD/lysophospholipase-like superfamily This superfamily of enzymes contains a Ser/Asp catalytic dyad. Members of this superfamily are all serine acylhydrolase enzymes. </p>
+
+</p>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td>
+<HR SIZE=3 WIDTH=80%>
+<center><b>16FR_NODE_10#PROKKA_00133</b><br>
+</center>
+</td>
+<tr></tr>
+<td bgcolor="#czb9dz">
+<pre> 
+MNPIEAISHTGRSVRSRLKGFPRKKVLVLEGGGMRGIFTVGVLQAFSERGYAPWKTIIGASAGALSGVVYAAGQIHMARDAFFTELISGR
+FIRMSNIFRPEKHILNLDWLVDHIIGGDEPLNIRRLRTTACPVLITVTRFSRDFPPDTLYLSTKTDSVPQALKATAAIPFFYRGFVHYRN
+DLLLDGGVLDSVPFKKALSMGFPERDILVVLTRPKGYRKERDSFWIKTLYESYYKDSQYRYLVNSLEHHFGNYNRMLDDLETNYDFDIIY
+PPDNFKVNRLTRSEDKIVDGFEQGVAAAKAYLKPK
+
+ </pre>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td>
+<p align="left">
+<a href=http://pfam.xfam.org/family/PF01734> PF01734</a>
+<p align="justify">Patatin-like phospholipase<br>Pfam-B_2206 (release 4.1) This family consists of various patatin glycoproteins from plants. The patatin protein accounts for up to 40% of  the total soluble protein in potato tubers  . Patatin is a storage protein but it also has the enzymatic activity of lipid acyl hydrolase, catalysing the cleavage of fatty acids from membrane lipids  . Members of this family have been found also in vertebrates. </p>
+
+</p>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td bgcolor="#czb9dz">
+<p align="left">
+<a href=http://pfam.xfam.org/clan/CL0323> CL0323</a>
+ <p align="justify">Patatin/FabD/lysophospholipase-like superfamily This superfamily of enzymes contains a Ser/Asp catalytic dyad. Members of this superfamily are all serine acylhydrolase enzymes. </p>
+
+</p>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td>
+<HR SIZE=3 WIDTH=80%>
+<center><b>16FR_NODE_10#PROKKA_00142</b><br>
+</center>
+</td>
+<tr></tr>
+<td bgcolor="#czb9dz">
+<pre> 
+MGKTIRRALVLSGGGARGAFEVGVMRYLNEVNWQPDLICGTSIGAINGAAFGSGMSVDELAHLWKTYHRKQMYKITFPAFFRTLLSGRKF
+SPLSDNRPTRSLLEKTIDIDALRNSTTEIIISVLNMRTSQVRYFTHKAIGIEHLMAAGGIPMMFPWQYIDGDPYWDAGVMVNTPIMPAFE
+RGATEIIVVLLSPLGAIPQRLPSTHREVSELVFEQFLIGSYTACLPNAGWRTNPEADVYDTPLPDSPQLQLSMKGVRMATVYPTRMLGFR
+SLLDFSPRQAKTLLRDGYVNARMQLKSFF
+
+ </pre>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td>
+<p align="left">
+<a href=http://pfam.xfam.org/family/PF01734> PF01734</a>
+<p align="justify">Patatin-like phospholipase<br>Pfam-B_2206 (release 4.1) This family consists of various patatin glycoproteins from plants. The patatin protein accounts for up to 40% of  the total soluble protein in potato tubers  . Patatin is a storage protein but it also has the enzymatic activity of lipid acyl hydrolase, catalysing the cleavage of fatty acids from membrane lipids  . Members of this family have been found also in vertebrates. </p>
+
+</p>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td bgcolor="#czb9dz">
+<p align="left">
+<a href=http://pfam.xfam.org/clan/CL0323> CL0323</a>
+ <p align="justify">Patatin/FabD/lysophospholipase-like superfamily This superfamily of enzymes contains a Ser/Asp catalytic dyad. Members of this superfamily are all serine acylhydrolase enzymes. </p>
+
+</p>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td>
+<HR SIZE=3 WIDTH=80%>
+<center><b>8FR_NODE_2#PROKKA_00232</b><br>
+</center>
+</td>
+<tr></tr>
+<td bgcolor="#czb9dz">
+<pre> 
+MGLSYRINFKLGANAMVISDPSMPVPASLQFRQLKSDEYVMLDIPTDSLEVAYWGTKKPVPAQYVLTEAQTAKVESAITSYNAEIKSLAK
+KYNLAFVDFNSIMKSIEHGGLTVDGIHFTTAFITGNLFSLDGVHLTPQGNAVVANYFIQAINKQYGSHIPSVMVSDYPSVVF
+
+ </pre>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td>
+<p align="left">
+<a href=http://pfam.xfam.org/family/PF00657> PF00657</a>
+<p align="justify">GDSL-like Lipase/Acylhydrolase<br>Prosite & Pfam-B_543 (Release 7.5) </p>
+
+</p>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td bgcolor="#czb9dz">
+<p align="left">
+<a href=http://pfam.xfam.org/clan/CL0264> CL0264</a>
+ <p align="justify">This superfamily contains a diversity of hydrolytic enzyme activities. </p>
+
+</p>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+</table>
+</div>
+</body>
+</html>
b
diff -r 000000000000 -r 68a3648c7d91 pfam_search/lista
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_search/lista Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,1 @@
+a
b
diff -r 000000000000 -r 68a3648c7d91 pfam_search/pfamA.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_search/pfamA.txt Thu Dec 22 04:45:31 2016 -0500
[
b'@@ -0,0 +1,14831 @@\n+1\tPF00389\t2-Hacid_dh\t2-Hacid_DH; \tD-isomer specific 2-hydroxyacid dehydrogenase, catalytic domain\tFinn RD, Griffiths-Jones SR\tanon\tProsite\tDomain\tThis family represents the largest portion of the catalytic domain  of 2-hydroxyacid dehydrogenases as the NAD binding domain is  inserted within the structural domain.\t24.60\t24.60\t24.60\t24.60\t24.50\t24.50\thmmbuild  -o /dev/null --hand HMM SEED\t133\thmmsearch -Z 23193494 -E 1000 --cpu 4 HMM pfamseq\t0.71\t-9.58\t0.71\t-10.50\t0.71\t-4.69\t98\t16860\t2012-10-02 14:31:05\t2003-04-07 12:59:11\t25\t50\t4524\t180\t4691\t12854\t6366\t308.20\t17\t85.12\tCHANGED\tlllhp....sh..pptshphlcc.........plphtp....shsp-..clhcthps..s-ulhstsps.....plspcll.pth..spLKlluptusGhDslDlcsAsc+GIhVsNsPs.ssspulAEhsluhllulsRclspspppl+pGpapppthhshphtspshsllGhsthGttssthtpthththhhhshhhs.pttpttthhhht.thhh...................psspllshps.tshppthhhtpptttthhsshhlsssttsshhststhtshtpptthsssshssppp.sshtp.LhshsNVllTPHluus..TpEAppshutpsspslhphhpG....psstssVs\t.......................................................................................h.t.hpt..............h..pl.phhp.............hspp....ph..h....p...t..l...t.s...........s..-..s.l.h..s.p.st.s...........l..s.p..c..ll...pth............spL+..hl.u....p....h.u...s.G....h....-....s..l....D....l.s.s....A...p.c..........p...........G...........IhV............s........N.s......P.s......s...s..s....p......u.....V....A..EhsluhlLshsRplspsppph+pGpapttshhshthtstphtslshsthsthhtthtpthththhhh.hh.ttttttttthh..t..h..........................................................................................................................................................tst.hh.htshts.ppthhhspphhthhhsshhlhssttsshhppshhtshhpttthsssshss.tptsshtpsLhphsNVllTPHluus..T.tE.up.pp.hs.p...t.s.hpslh.p.hh.ps......t....p.l..............................................................................................................................................................................................
....................................................................................................................................................................................................................................................................................................\t0\t1361\t2755\t3878\n+2\tPF00198\t2-oxoacid_dh\t\t2-oxoacid dehydrogenases acyltransferase (catalytic domain)\tBateman A, Finn RD, Griffiths-Jones SR\tanon\tBateman A\tDomain\tThese proteins contain one to three copies of a lipoyl binding domain followed by the catalytic domain.\t23.00\t23.00\t23.10\t23.50\t22.90\t22.90\thmmbuild  -o /dev/null HMM SEED\t231\thmmsearch -Z 23193494 -E 1000 --cpu 4 HMM pfamseq\t0.70\t-10.60\t0.70\t-11.30\t0.70\t-5.09\t100\t10039\t2012-10-02 12:01:53\t2003-04-07 12:59:11\t18\t49\t3997\t65\t2920\t7770\t5825\t224.80\t35\t47.20\tCHANGED\tssstpplPlsshR+slAcphspStp.shPphshs...s-l-hspLhplRp.p..................lppph..tt........KlohhshllKAsuhALccaPhlNush..s..s-s.....llhpcplslGlAVsos.....pG..LlVPVl+sscptulhplupclpcLsp+ARss+.Lpss-hpG.GTFTlSNlGsh.G.sphhoPIIN..PpsAILulGpl.pcpP.ls.tssp.........lshpph.....hsloLohDHRllDGAsuucFlppl+.chLE..sPttll.l\t.......................................................t....ppl.h.stlR+tlApphhcuhp.ss.s..p...lThh...s-lDhst...lhshRp.p.......................................hpp.t.htcpps..............KLohhsahsKA.lstAL..+..c.a.P.tlNuuh.........s...............scs........llh+ph...hs..lG.......lAV..s...T..s............pG.....L.lV..P...V.l+..ss...-.p.h.ultpls.p.........c.l...pcLup....+.......AR.cG...K..Ls.....s...p-h..pG...GTFTI....oNhG...s........h....G..uh..h..T.PI....l.Ns..Pps.A.I.L.G.luph..tp+......P.ls....hssp................................lshcsh...h.LuLSaDHRllDGtpuu.pFLspl+.phLE..sPtthl.............................\t0\t940\t1801\t2459\n+3\tPF04029\t2-ph_phosp\t\t2-phosphosulpholactate phosphatase\tKerrison ND, Finn RD\tanon\tCOG2045\tFamily\tThought to catalyse 2-phosphosulpholactate = sulpholactate + phosphate. 
Probable magnesium cofactor.  Involved in the second step of coenzyme M biosynthesis.  Inhibited by vanadate in Methanococcu'..b'th intracellular and extracellular pathogens [1].\t25.00\t25.00\t28.30\t27.40\t20.40\t19.30\thmmbuild  -o /dev/null HMM SEED\t119\thmmsearch -Z 23193494 -E 1000 --cpu 4 HMM pfamseq\t0.71\t-9.62\t0.71\t-10.73\t0.71\t-3.90\t9\t20\t2012-10-09 19:11:59\t2012-10-09 20:11:59\t1\t5\t16\t0\t5\t19\t0\t116.40\t28\t18.40\tNEW\ttlaGscsphtshc.lptslpsIs+p.S..spsIhIlSGoHGhssG.........pNWhtps.........lRcPpl.h-hpFhtpDhpshpt....hscplhlhDlssss.tchss..lpss......ssphILuYCaStsDpsht.\t.........................lhstptth.sht.lptshphlt+p.s.sstcIhIlSGoHGtssG.........pNasups...............lRcPsl.hE+tFahEDlpsap.t...........hstpV+lhDlushop.pEhss.plpss......spplIhGaCaSpsDchhh.h....\t0\t1\t2\t4\n+15517\tPF15657\tTox-HNH-EHHH\t\tHNH/Endo VII superfamily nuclease toxins\tZhang D, de Souza RF, Anantharaman V, Iyer LM, Aravind L, Finn RD\trdf\t[1]\tFamily\tA predicted toxin of the HNH/Endonuclease VII fold present in bacterial polymorphic toxin systems with a characteristic conserved [ED]H motif and two histidine residues. 
In bacterial polymorphic toxin systems, the toxin is exported by the type 2, type 5, type 6, type 7 or Photorhabdus virulence cassette (PVC)-type secretion system [1].\t25.00\t25.00\t26.70\t26.60\t23.40\t23.40\thmmbuild  -o /dev/null HMM SEED\t72\thmmsearch -Z 23193494 -E 1000 --cpu 4 HMM pfamseq\t0.72\t-8.75\t0.72\t-9.70\t0.72\t-3.83\t33\t80\t2012-10-09 19:15:32\t2012-10-09 20:15:32\t1\t26\t69\t0\t26\t75\t0\t66.90\t30\t8.65\tNEW\ttlhcpsspshhsRpYcFpsscG.....ppllIp-HotGHths.....tspsPHFNsts.cs...........................hpsGphsspcsHYsa\t.......hhsspspshhsR.acapshcG.........pKhlIp-HotG+th.......spuPHapsts.pp............................h.c.puphstp.psHY.a.......................\t0\t2\t9\t18\n+15518\tPF15658\tLatrotoxin_C\t\tLatrotoxin C-terminal domain\tZhang D, de Souza RF, Anantharaman V, Iyer LM, Aravind L, Finn RD\trdf\t[1]\tFamily\tA toxin domain present in arthropod alphaproteobacterial, gammaproteobacterial endosymbionts and also at the C-termini of the latrotoxins of the black widow spider. 
The domain is characterized by a conserved, hydrophobic helix and is predicted to associate with the cell membrane [1].\t25.00\t25.00\t46.20\t43.40\t24.70\t22.60\thmmbuild  -o /dev/null HMM SEED\t127\thmmsearch -Z 23193494 -E 1000 --cpu 4 HMM pfamseq\t0.71\t-9.89\t0.71\t-10.46\t0.71\t-4.31\t8\t23\t2012-10-09 19:27:40\t2012-10-09 20:27:40\t1\t16\t8\t0\t8\t25\t0\t126.00\t38\t6.65\tNEW\tphDsNuslhLLDlLIRKlTspKYhsT....sc.polSPLEApGYALsIsKcFEcVlc.QAulKSGISh+cLNIDhlElQcpIssK..ItSGKFsEISulLsSYlEcAhPs......ucLS.Kph-KFhspFNscl-.....slLNp\t......hDVNGslhLLDlLIRKhoupKYhss....sc....pSISsLEAQuYALsIsc+FEcVLp.psulKuGlShcpLNlDhstlQpcIhtK..lhuG+FsEIuthLsSasccAhPt......upLp.KphcKFh.pFppthc.hlp.............\t0\t0\t2\t2\n+15519\tPF15659\tToxin-JAB1\t\tJAB-like toxin  1\tZhang D, de Souza RF, Anantharaman V, Iyer LM, Aravind L, Finn RD\trdf\t[1]\tFamily\t\\N\t26.40\t26.40\t26.40\t26.60\t25.60\t25.70\thmmbuild  -o /dev/null HMM SEED\t162\thmmsearch -Z 23193494 -E 1000 --cpu 4 HMM pfamseq\t0.71\t-10.07\t0.71\t-11.36\t0.71\t-4.36\t13\t24\t2012-10-10 14:49:21\t2012-10-09 20:38:55\t1\t2\t16\t0\t6\t24\t1\t153.20\t26\t34.55\tNEW\tp+ss+las....phtsusps-phhssp.ts..psIpl.spsll...uphsp..tpsppG....thpoas....oTssspsAtslFcFsA-NTo..VEWpLsshp.-sGs.pshsltTspcptuspss..hsph.pc.htssuphlhc..IHSH...........Ptss.....tsS.....hsDhphup.tps..........huhYhpc.......tthhphYs\t......................ppssclhp.....hhpssps-phh.hp.ts..psIpl.sppll...sthhp..tpsppG....hhpoas....sTss.csAtslFcFsA-NTo..VEWpLssht.-p....Gs.pshhlsTsppppuVt....hs.htpc.h.stsphlIc..hHSH...........Phss....tsS.....spDhpshp.hps...........shahcc........t.h..Y................\t0\t6\t6\t6\n+15520\tPF15660\tImm49\t\tImmunity protein 49\tZhang D, de Souza RF, Anantharaman V, Iyer LM, Aravind L, Finn RD\trdf\t[1]\tFamily\t\\N\t25.00\t25.00\t168.90\t168.80\t24.80\t19.60\thmmbuild  -o /dev/null HMM SEED\t84\thmmsearch -Z 23193494 -E 1000 --cpu 4 HMM pfamseq\t0.72\t-9.17\t0.72\t-9.79\t0.72\t-3.94\t4\t10\t2012-10-09 20:02:34\t2012-10-09 
21:02:34\t1\t1\t10\t0\t0\t10\t0\t82.90\t99\t96.96\tNEW\thRALVALKRELLPGVTTFIDSVRLEAIDDKADRLMVTTSVGEEARLVYFNPDFAGTPTFGRRLYRLRDWTDDLADWVDRLRRER\tVRALVALKRELLPGVTTFIDSVRLEAIDDKADRLMVTTSVGEEARLVYFNPDFAGTPTFGRRLYRLRDWTDDLADWVDRLRRER\t0\t0\t0\t0\n'
b
diff -r 000000000000 -r 68a3648c7d91 pfam_search/pfamA.txt.gz
b
Binary file pfam_search/pfamA.txt.gz has changed
b
diff -r 000000000000 -r 68a3648c7d91 pfam_search/pfam_filter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_search/pfam_filter.xml Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,19 @@
+<tool id="pfam_Filter" name="PFAM search" version="0.">
+ <!-- Galaxy wrapper around annota.Filter.pl: keyword search over a pfamScan annotation. -->
+ <description>keyword search on PFAM annotations</description>
+ <!-- Positional arguments: protein FASTA, pfamScan table, output file, search expression. The script is resolved relative to this tool's own directory rather than an absolute path on one machine. -->
+ <command>$__tool_directory__/annota.Filter.pl $prot $pfam $out $search</command>
+ <inputs>
+  <param name="prot" type="data" format="fasta" label="protein file used as input with pfamScan" help="fasta only"/>
+  <param name="pfam" type="data" format="tabular" label="pfamScan output" help="this file should match the protein file"/>
+  <param name="search" type="text" label="regular expression to search" help="case sensitive!"/>
+ </inputs>
+ <outputs>
+  <!-- Output datasets declare their datatype with "format"; the result is declared as HTML. -->
+  <data name="out" format="html" label="annotated pfam file"/>
+ </outputs>
+ <tests/>
+ <help>
+ This tool performs keyword searches in order to facilitate the retrieval of proteins containing functional domains of interest from a PFAM annotation. Keyword searches are performed through a simple Perl script implementing exact searches of user specified keywords within the description of each PFAM domain as reported in the clans.txt or pfam.txt domain description files.  Searches can be performed using logical connectors, AND, OR and NOT.  When multiple keywords are entered, the default is to use the AND connector. 
+ </help>
+</tool>
b
diff -r 000000000000 -r 68a3648c7d91 pfam_search/prots
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_search/prots Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,80 @@
+>PROKKA_00001 Mercuric resistance operon regulatory protein
+MENNLENLTIGVFAKAAGVNVETIRFYQRKGLLREPDKPYGSIRRYGEADVVRVKFVKSA
+QRLGFSLDEIAELLRLDDGTHCEEASSLAEHKLKDVREKMADLARMETVLSELVCACHAR
+KGNVSCPLIASLQGEAGLARSAMP*
+>PROKKA_00002 MerT mercuric transport protein
+MSEPQNGRGALFAGGLAAILASTCCLGPLVLVALGFSGAWIGNLTVLEPYRPLFIGAALV
+ALFFAWKRIYRPVQACKPGEVCAIPQVRATYKLIFWIVAVLVLVALGFPYVVPFFY*
+>PROKKA_00003 Mercuric transport protein periplasmic component precursor
+MKKLFASLALAAAVAPVWAATQTVTLAVPGMTCAACPITVKKALSKVEGVSKVDVGFEKR
+EAVVTFDDTKASVQKLTKATADAGYPSSVKQ*
+>PROKKA_00004 Mercuric resistance protein MerC
+MGLMTRIADKTGALGSVVSAMGCAACFPALASFGAAIGLGFLSQYEGLFISRLLPLFAAL
+AFLANALGWFSHRQWLRSLLGMIGPAIVFAATVWLLGNWWTANLMYVGLALMIGVSIWDF
+VSPAHRRCGPDGCELPAKRL*
+>PROKKA_00005 Mercuric reductase
+MSTLKITGMTCDSCAVHVKDALEKVPGVQSADVSYAKGSAKLAIEVGTSPDALTAAVAGL
+GYRATLADAPSVSTPGGLLDKMRDLLGRNDKTGSSGALHIAVIGSGGAAMAAALKAVEQG
+ARVTLIERGTIGGTCVNVGCVPSKIMIRAAHIAHLRRESPFDGGIAATTPTIQRTALLAQ
+QQARVDELRHAKYEGILEGNPAITVLHGSARFKDNRNLIVQLNDGGERVVAFDRCLIATG
+ASPAVPPIPGLKDTPYWTSTEALVSETIPKRLAVIGSSVVALELAQAFARLGAKVTILAR
+STLFFREDPAIGEAVTAAFRMEGIEVREHTQASQVAYINGEGDGEFVLTTAHGELRADKL
+LVATGRAPNTRKLALDATGVTLTPQGAIVIDPGMRTSVEHIYAAGDCTDQPQFVYVAAAA
+GTRAAINMTGGDAALNLTAMPAVVFTDPQVATVGYSEAEAHHDGIKTDSRTLTLDNVPRA
+LANFDTRGFIKLVVEEGSGRLIGVQAVAPEAGELIQTAALAIRNRMTVQELADQLFPYLT
+MVEGLKLAAQTFNKDVKQLSCCAG*
+>PROKKA_00006 zinc-responsive transcriptional regulator
+MSAYTVSQLAHNAGVSVHIVRDYLVRGLLRPVACTTGGYGVFDDAALQRLCFVRAAFEAG
+IGLDALARLCRALDAADGAQAAAQLAVLRQLVERRRAALAHLDAQLASMPAERAHEEALP
+*
+>PROKKA_00007 MerE protein
+VNAPDKLPPETRQPVSGYLWGALAVLTCPCHLPILAAVLAGTTAGAFLGEHWGVAALALT
+GLFVLAVTRLLRAFRGGS*
+>PROKKA_00008 Phytochrome-like protein cph2
+MTSSQPAGWTAAELAQAAARGQLDLHYQPLVDLRDHRIAGAEALMRWRHPRLGLLPPGQF
+LPLAESFGLMPEIGAWVLGEACRQMHKWQGPAWQPFRLAINVSASQVGPTFDDEVKRVLA
+DMALPAELLEIELTESVAFGNPALFASFDALRAIGVRFAADDFGTGYSCLQHLKCCPITT
+LKIDQSFVARLPDDARDQTIVRAVIQLAHGLGMDVIFRRRLHQLIGRNGCCAASS*
+>PROKKA_00009 Transposon Tn7 transposition protein TnsB
+MATDTPRIPEQGVATLPDEAWERARRRAEIISPLAQSETVGHEAADMAAQALGLSRRQVY
+VLIRRARQGSGLVTDLVPGQSGGGKGKGRLPEPVERVIHELLQKRFLTKQKRSLAAFHRE
+VTQVCKAQKLRVPARNTVALRIASLDPRKVIRRREGQDAARDLQGVGGEPPAVTAPLEQV
+QIDHTVIDLIVVDDRDRQPIGRPYLTLAIDVFTRCVLGMVVTLEAPSAVSVGLCLVHVAC
+DKRPWLEGLNVEMDWQMSGKPLLLYLDNAAEFKSEALRRGCEQHGIRLDYRPLGQPHYGG
+IVERIIGTAMQMIHDELPGTTFSNPDQRGDYDSENKAALTLRELERWLTLAVGTYHGSVH
+NGLLQPPAARWAEAVARVGVPAVVTRATSFLVDFLPILRRTLTRTGFVIDHIHYYADGHC
+CK*
+>PROKKA_00010 Integrase core domain protein
+MNPFKGRHFQRDIILWAVRWYCKYGISYRELQEMLAERGVNVDHSTIYRWVQRYAPEMEK
+RLRWYWRNPSDLCPWHMDETYVKVNGRWAYLYRAVDSRGRTVDFYLSSRRNSKAAYRFLG
+KILNNVKKWQIPRFINTDKAPAYGRALALLKREGRCPSDVEHRQIKYRNNVIECDHGKLK
+RIIGATLGFKSMKTAYATIKGIEVMRALRKGQASAFYYGDPLGEMRLVSRVFEM*
+>PROKKA_00011 DNA-binding transcriptional regulator LysR
+MKLRHLDIFYAVMTCGSLTRAAEVLHISQPAASKALKHAEH*
+>PROKKA_00012 hypothetical protein
+MPSRFLTPYIPLVNLFSLHVYELILVTTKPKFEL*
+>PROKKA_00013 Sodium/glutamate symport carrier protein
+MILDASYTLLVACIALLIGMFVVKFTPFLQKNHIPEAVVGGFIVAIVLLIIDKTSGYSFT
+FDASLQSLLMLTFFSSIGLSSDFSRLIKGGKPLVLLTIAVTILIAIQNTVGMSMAVMMNE
+SPFIGLIAGSITLTGGHGNAGAWGPILADKYGVTGAVELAMACATLGLVLGGLVGGPVAR
+HLLKKVSIPKTTEQERDTIVEAFEQPSVKRKINANNVIETISMLIICIVVGGYISALFKD
+TFLQLPTFVWCLFVGIIIRNTLTHVFKHEVFEPTVDVLGSVALSLFLAMALMSLKFGQLA
+SMAGPVLIIIAVQTVVMVLFACFVTFKMMGKDYDAVVISAGHCGFGMGATPTAIANMQTV
+TKAFGPSHKAFLVVPMVGAFIVDISNSILIKIFIEIGTYFT*
+>PROKKA_00014 Antibiotic biosynthesis monooxygenase
+MIAVIFEVQIQPDQQTRYLTLAEELRPLLSHVAGFISIERFQSLATEGKMLSLSWWENEY
+AVLQWKNHVLHAKAQQEGRESIFDFYKISIAHITREYSFKKDKDNV*
+>PROKKA_00015 hypothetical protein
+MFDVHVVLDNQIGQLALLGKTLGNKGIGLEGGGIFTVGDECHAHFLVEQGKEAKIALEQA
+GLLVLAIRTPLIRKLKQEKPGELGEIARVLAENNINILVQYSDHANQLILITDNDSMAAS
+VTLPWAIK*
+>PROKKA_00016 Helix-turn-helix domain protein
+MSDISRVKILSALMDGRAWTATELSSVANISASTASSHLSKLLDCQLITVVAQGKHRYFR
+LAGKDIAELMESMMGISLNHGVHARVSTPVHLRKARTCYDHLAGEVAVKIYDSLCQQQWI
+TENGSMITLSGIQYFHEMGIDVPSKHSRKICCACLDWSERRFHLGGYVGAALFSLYESKG
+WLTRHLGYREVTITEKGYAAFKTHFHI*
+>PROKKA_00017 Tetracycline repressor protein class B from transposon Tn10
+MSRLDKSKVINSALELLNEVGIEGLTTRKLAQKLGVEQPTLYWHVKNKRALLDALAIEML
+DRHHTHFCPLEGESWQDFLRNNAKSFRCALLSHRDGAKVHLGTRPTEKQYETLENQLAFL
+CQQGFSLENALYALSAVGHFTLGCVLEDQEHQVAKEERETPTTDSMPPLLRQAIELFDHQ
+GAEPAFLFGLELIICGLEKQLKCESGS*
b
diff -r 000000000000 -r 68a3648c7d91 pfam_search/prova
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_search/prova Thu Dec 22 04:45:31 2016 -0500
[
b'@@ -0,0 +1,536 @@\n+<html>\n+<head></head>\n+<body>\n+Proteins with PFAM domains:\n+<br><br>\n+<div>\n+<table cellpadding="0" width=950>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00001</b><br>\n+</center>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MENNLENLTIGVFAKAAGVNVETIRFYQRKGLLREPDKPYGSIRRYGEADVVRVKFVKSAQ<br>RLGFSLDEIAELLRLDDGTHCEEASSLAEHKLKDVREKMADLARMETVLSELVCACHARK<br>GNVSCPLIASLQGEAGLARSAMP*\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF00376> PF00376</a>merR; <br>MerR family regulatory protein Prosite & Pfam-B_3021 (Release 7.5) \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0123> CL0123</a> This family contains a diverse range of mostly DNA-binding domains that contain a helix-turn-helix motif. \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF09278> PF09278</a>MerR, DNA binding<br>Members of this family of DNA-binding domains are predominantly found in the prokaryotic transcriptional regulator MerR. They adopt a structure consisting of a core of three alpha helices, with an architecture that is similar to that of the \'winged helix\' fold  . \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0123> CL0123</a> This family contains a diverse range of mostly DNA-binding domains that contain a helix-turn-helix motif. 
\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00002</b><br>\n+</center>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MSEPQNGRGALFAGGLAAILASTCCLGPLVLVALGFSGAWIGNLTVLEPYRPLFIGAALVA<br>LFFAWKRIYRPVQACKPGEVCAIPQVRATYKLIFWIVAVLVLVALGFPYVVPFFY*\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF02411> PF02411</a>MerT mercuric transport protein<br>MerT mercuric transport protein MerT is an mercuric transport integral membrane protein and  is responsible for transport of the Hg2+ iron from periplasmic  MerP (also part of the transport system) to mercuric reductase (MerE). \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00003</b><br>\n+</center>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MKKLFASLALAAAVAPVWAATQTVTLAVPGMTCAACPITVKKALSKVEGVSKVDVGFEKRE<br>AVVTFDDTKASVQKLTKATADAGYPSSVKQ*\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF00403> PF00403</a>Heavy-metal-associated domain<br>\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00004</b><br>\n+</center>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MGLMTRIADKTGALGSVVSAMGCAACFPALASFGAAIGLGFLSQYEGLFISRLLPLFAALA<br>FLANALGWFSHRQWLRSLLGMIGPAIVFAATVWLLGNWWTANLMYVGLALMIGVSIWDFV<br>SPAHRRCGPDGCELPAKRL*\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF03203> PF03203</a>MerC mercury resistance protein<br>MerC mercury resistance protein \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00005</b><br>\n+</center>\n+<td bgcolor="#czb9dz">\n+<pre> 
\n+MSTLKITGMTCDSCAVHVKDALEKVPGVQSADVSYAKGSAKLAIEVGTSPDALTAAVAGLG<br>YRATLADAPSVSTPGGLLDKMRDLLGRNDKTGSSGALHIAVIGSGGAAMAAALKAVEQGA<br>RVTLIERGTIGGTCVNVGCVPSKIMIRAAHIAHLRRESPFDGGIAATTPTIQRTALLAQQ<br>QARVDELRHAKYEGILEGNPAITVLHGSARFKDNRNLIVQLNDGGERVVAFDRCLIATGA<br>SPAVPPIPGLKDTPYWTSTEALVSETIPKRLAVIGSSVVALELAQAFARLGAKVTILARS<br>TLFFREDPAIGEAVTAAFRMEGIEVREHTQASQVAYINGEGDGEFVLTTAHGELRADKLL<br>VATGRAPNTRKLALDATGVTLTPQGAIVIDPGMRTSVEHIYAAGDCTDQPQFVYVAAAAG<br>TRAAINMTGGDAALNLTAMPAVVFTDPQVATVGYSEAEAHHDGIKTDSRTLTLDNVPRAL<br>ANFDTRGFIKLVVEEGSGRLIGVQAVAPEAGELIQTAALAIRNRMTVQELADQLFPYLTM<br>VEGLKLAAQTFNKDVKQLSCCAG*\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF00403> PF00403</a>Heavy-metal'..b'br>LLVLAIRTPLIRKLKQEKPGELGEIARVLAENNINILVQYSDHANQLILITDNDSMAASV<br>TLPWAIK*\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF01842> PF01842</a>ACT domain<br>This family of domains generally have a regulatory role. ACT domains are linked to a wide range of metabolic enzymes that are regulated by amino acid concentration. Pairs of ACT domains bind specifically to a particular amino acid leading to regulation of the linked enzyme.  The ACT domain is found in: D-3-phosphoglycerate dehydrogenase EC:1.1.1.95 Swiss:P08328, which is inhibited by serine  . Aspartokinase EC:2.7.2.4 Swiss:P53553, which is regulated by lysine. Acetolactate synthase small regulatory subunit Swiss:P00894, which is inhibited by valine. Phenylalanine-4-hydroxylase EC:1.14.16.1 Swiss:P00439, which is regulated by phenylalanine. Prephenate dehydrogenase EC:4.2.1.51 Swiss:P21203. formyltetrahydrofolate deformylase EC:3.5.1.10, Swiss:P37051, which is activated by methionine and inhibited by glycine. 
GTP pyrophosphokinase EC:2.7.6.5 Swiss:P11585 \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0070> CL0070</a> These domains are involved in binding to amino-acids and causing allosteric regulation of linked enzyme domains  . The relationship between these two families was first noticed in  . \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00016</b><br>\n+</center>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MSDISRVKILSALMDGRAWTATELSSVANISASTASSHLSKLLDCQLITVVAQGKHRYFRL<br>AGKDIAELMESMMGISLNHGVHARVSTPVHLRKARTCYDHLAGEVAVKIYDSLCQQQWIT<br>ENGSMITLSGIQYFHEMGIDVPSKHSRKICCACLDWSERRFHLGGYVGAALFSLYESKGW<br>LTRHLGYREVTITEKGYAAFKTHFHI*\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF12840> PF12840</a>Helix-turn-helix domain<br>This domain represents a DNA-binding Helix-turn-helix domain found in transcriptional regulatory proteins. \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0123> CL0123</a> This family contains a diverse range of mostly DNA-binding domains that contain a helix-turn-helix motif. 
\n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<HR SIZE=3 WIDTH=80%>\n+<center><b>PROKKA_00017</b><br>\n+</center>\n+<td bgcolor="#czb9dz">\n+<pre> \n+MSRLDKSKVINSALELLNEVGIEGLTTRKLAQKLGVEQPTLYWHVKNKRALLDALAIEMLD<br>RHHTHFCPLEGESWQDFLRNNAKSFRCALLSHRDGAKVHLGTRPTEKQYETLENQLAFLC<br>QQGFSLENALYALSAVGHFTLGCVLEDQEHQVAKEERETPTTDSMPPLLRQAIELFDHQG<br>AEPAFLFGLELIICGLEKQLKCESGS*\n+ </pre>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF00440> PF00440</a>tetR; <br>Bacterial regulatory proteins, tetR family \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0123> CL0123</a> This family contains a diverse range of mostly DNA-binding domains that contain a helix-turn-helix motif. \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td>\n+<p align="left">\n+<a href=http://pfam.xfam.org/family/PF02909> PF02909</a>tetR_C; <br>Tetracyclin repressor, C-terminal all-alpha domain \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+<td bgcolor="#czb9dz">\n+<p align="left">\n+<a href=http://pfam.xfam.org/clan/CL0174> CL0174</a> TetR protein, C-terminal domain-like This clan features families of transcriptional regulators for multidrug efflux pumps, which belong to the TetR superfamily. They are induced by the presence of a variety of factors, such as antibiotics or organic solvents. The C-terminal region featured in these families is thought to contain the inducer-binding site; the divergent sequences in this region allow for the binding of a variety of different inducers [1-4]. \n+\n+</p>\n+</td>\n+<tr></tr>\n+<td></td>\n+<tr></tr>\n+</table>\n+</div>\n+</body>\n+</html>\n'
b
diff -r 000000000000 -r 68a3648c7d91 pfam_search/prova2
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_search/prova2 Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,148 @@
+<html>
+<head></head>
+<body>
+Proteins with PFAM domains:
+<br><br>
+<div>
+<table cellpadding="0" width=650>
+<td>
+<HR SIZE=3 WIDTH=80%>
+<center><b>PROKKA_00001</b><br>
+</center>
+</td
+<tr></tr>
+<td bgcolor="#czb9dz">
+<pre> 
+MENNLENLTIGVFAKAAGVNVETIRFYQRKGLLREPDKPYGSIRRYGEADVVRVKFVKSA
+QRLGFSLDEIAELLRLDDGTHCEEASSLAEHKLKDVREKMADLARMETVLSELVCACHAR
+KGNVSCPLIASLQGEAGLARSAM
+
+ </pre>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td>
+<p align="left">
+<a href=http://pfam.xfam.org/family/PF09278> PF09278</a>
+<p align="justify">MerR, DNA binding<br>Members of this family of DNA-binding domains are predominantly found in the prokaryotic transcriptional regulator MerR. They adopt a structure consisting of a core of three alpha helices, with an architecture that is similar to that of the 'winged helix' fold  . </p>
+
+</p>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td bgcolor="#czb9dz">
+<p align="left">
+<a href=http://pfam.xfam.org/clan/CL0123> CL0123</a>
+ <p align="justify">This family contains a diverse range of mostly DNA-binding domains that contain a helix-turn-helix motif. </p>
+
+</p>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td>
+<HR SIZE=3 WIDTH=80%>
+<center><b>PROKKA_00009</b><br>
+</center>
+</td
+<tr></tr>
+<td bgcolor="#czb9dz">
+<pre> 
+MATDTPRIPEQGVATLPDEAWERARRRAEIISPLAQSETVGHEAADMAAQALGLSRRQVY
+VLIRRARQGSGLVTDLVPGQSGGGKGKGRLPEPVERVIHELLQKRFLTKQKRSLAAFHRE
+VTQVCKAQKLRVPARNTVALRIASLDPRKVIRRREGQDAARDLQGVGGEPPAVTAPLEQV
+QIDHTVIDLIVVDDRDRQPIGRPYLTLAIDVFTRCVLGMVVTLEAPSAVSVGLCLVHVAC
+DKRPWLEGLNVEMDWQMSGKPLLLYLDNAAEFKSEALRRGCEQHGIRLDYRPLGQPHYGG
+IVERIIGTAMQMIHDELPGTTFSNPDQRGDYDSENKAALTLRELERWLTLAVGTYHGSVH
+NGLLQPPAARWAEAVARVGVPAVVTRATSFLVDFLPILRRTLTRTGFVIDHIHYYADGHC
+C
+
+ </pre>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td>
+<p align="left">
+<a href=http://pfam.xfam.org/family/PF13518> PF13518</a>
+<p align="justify">Helix-turn-helix domain<br>This helix-turn-helix domain is often found in transposases and is likely to be DNA-binding. </p>
+
+</p>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td bgcolor="#czb9dz">
+<p align="left">
+<a href=http://pfam.xfam.org/clan/CL0123> CL0123</a>
+ <p align="justify">This family contains a diverse range of mostly DNA-binding domains that contain a helix-turn-helix motif. </p>
+
+</p>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td>
+<p align="left">
+<a href=http://pfam.xfam.org/family/PF00665> PF00665</a>
+<p align="justify">Integrase core domain<br>Integrase mediates integration of a DNA copy of the viral genome into the host chromosome.  Integrase is composed of three domains.  The amino-terminal domain is a zinc binding domain Pfam:PF02022.  This domain is the central catalytic domain. The carboxyl terminal domain that is a non-specific DNA binding domain Pfam:PF00552. The catalytic domain acts as an endonuclease when two nucleotides are removed from the 3' ends of the blunt-ended viral DNA made by reverse transcription. This domain also catalyses the DNA strand transfer reaction of the 3' ends of the viral DNA to the 5' ends of the integration site  . </p>
+
+</p>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td bgcolor="#czb9dz">
+<p align="left">
+<a href=http://pfam.xfam.org/clan/CL0219> CL0219</a>
+ <p align="justify">Ribonuclease H-like superfamily This clan includes a diverse set of nucleases that share a similar structure to Ribonuclease H. </p>
+
+</p>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td>
+<HR SIZE=3 WIDTH=80%>
+<center><b>PROKKA_00016</b><br>
+</center>
+</td
+<tr></tr>
+<td bgcolor="#czb9dz">
+<pre> 
+MSDISRVKILSALMDGRAWTATELSSVANISASTASSHLSKLLDCQLITVVAQGKHRYFR
+LAGKDIAELMESMMGISLNHGVHARVSTPVHLRKARTCYDHLAGEVAVKIYDSLCQQQWI
+TENGSMITLSGIQYFHEMGIDVPSKHSRKICCACLDWSERRFHLGGYVGAALFSLYESKG
+WLTRHLGYREVTITEKGYAAFKTHFH
+
+ </pre>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td>
+<p align="left">
+<a href=http://pfam.xfam.org/family/PF12840> PF12840</a>
+<p align="justify">Helix-turn-helix domain<br>This domain represents a DNA-binding Helix-turn-helix domain found in transcriptional regulatory proteins. </p>
+
+</p>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+<td bgcolor="#czb9dz">
+<p align="left">
+<a href=http://pfam.xfam.org/clan/CL0123> CL0123</a>
+ <p align="justify">This family contains a diverse range of mostly DNA-binding domains that contain a helix-turn-helix motif. </p>
+
+</p>
+</td>
+<tr></tr>
+<td></td>
+<tr></tr>
+</table>
+</div>
+</body>
+</html>
b
diff -r 000000000000 -r 68a3648c7d91 pfam_search/search
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_search/search Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,3 @@
+MerR
+family
+regulatory
b
diff -r 000000000000 -r 68a3648c7d91 pfam_search/table
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_search/table Thu Dec 22 04:45:31 2016 -0500
b
@@ -0,0 +1,51 @@
+# pfam_scan.pl,  run at Fri Jun 19 13:56:11 2015
+#
+# Copyright (c) 2009 Genome Research Ltd
+# Freely distributed under the GNU 
+# General Public License
+#
+# Authors: Jaina Mistry (jaina@ebi.ac.uk), 
+#          Rob Finn (rdf@ebi.ac.uk)
+#
+# This is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later version.
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program. If not, see <http://www.gnu.org/licenses/>. 
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+#      query sequence file: /home/inmare/galaxy/database/files/000/dataset_62.dat
+#     cpu number specified: 2
+#        searching against: /home/inmare/galaxy/tools/pfamScan/hmm/Pfam-A.hmm, with cut off --cut_ga
+#    resolve clan overlaps: on
+#     predict active sites: off
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+#
+# <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan>
+
+PROKKA_00001      9     46      9     46 PF00376.18  MerR              Family     1    38    38     43.1   2.2e-11   1 CL0123   
+PROKKA_00001     51    113     51    113 PF09278.6   MerR-DNA-bind     Domain     1    65    65     67.9   7.2e-19   1 CL0123   
+PROKKA_00002      1    116      1    116 PF02411.10  MerT              Family     1   116   116    214.7   1.7e-64   1 No_clan  
+PROKKA_00003     25     85     25     86 PF00403.21  HMA               Domain     1    61    62     60.3   1.4e-16   1 No_clan  
+PROKKA_00004      8    121      8    123 PF03203.9   MerC              Family     1   114   116     81.1   7.4e-23   1 No_clan  
+PROKKA_00005      3     62      3     63 PF00403.21  HMA               Domain     1    61    62     37.8   1.5e-09   1 No_clan  
+PROKKA_00005    100    410     99    412 PF07992.9   Pyr_redox_2       Domain     2   199   201    140.8   4.8e-41   1 CL0063   
+PROKKA_00005    271    345    271    354 PF00070.22  Pyr_redox         Domain     1    74    80     51.6   9.3e-14   1 CL0063   
+PROKKA_00005    440    548    440    549 PF02852.17  Pyr_redox_dim     Domain     1   109   110    104.5   2.9e-30   1 No_clan  
+PROKKA_00006      4     72      4     73 PF13411.1   MerR_1            Family     1    68    69     44.0   1.3e-11   1 CL0123   
+PROKKA_00007      2     74      1     75 PF05052.7   MerE              Family     2    74    75    142.4     3e-42   1 No_clan  
+PROKKA_00008     13    218     11    219 PF00563.15  EAL               Domain     3   209   236    169.6   7.3e-50   1 No_clan  
+PROKKA_00009     26     78     26     87 PF13518.1   HTH_28            Domain     1    46    52     26.9   3.4e-06   1 CL0123   
+PROKKA_00009    173    313    172    313 PF00665.21  rve               Domain     2   120   120     85.4   2.9e-24   1 CL0219   
+PROKKA_00010     75    212     73    213 PF13610.1   DDE_Tnp_IS240     Domain     4   139   140    170.7   1.6e-50   1 CL0219   
+PROKKA_00011      3     40      3     41 PF00126.22  HTH_1             Domain     1    38    60     48.8   4.1e-13   1 CL0123   
+PROKKA_00013      2    366      1    366 PF03616.9   Glt_symporter     Family     2   368   368    544.8  6.1e-164   1 CL0064   
+PROKKA_00014      1     76      1     77 PF03992.11  ABM               Domain     1    77    78     48.0   8.8e-13   1 CL0032   
+PROKKA_00015     78     98     74    115 PF01842.20  ACT               Domain     9    29    66     20.5   0.00024   1 CL0070   
+PROKKA_00016      5     55      1     55 PF12840.2   HTH_20            Domain    11    61    61     35.6   5.6e-09   1 CL0123   
+PROKKA_00017     10     55      9     55 PF00440.18  TetR_N            Domain     2    47    47     53.5   1.3e-14   1 CL0123   
+PROKKA_00017     68    201     68    201 PF02909.12  TetR_C            Domain     1   139   139    165.1   6.5e-49   1 CL0174