Mercurial > repos > hammock > hammock
diff external_tools/linux/lib/hh/scripts/splitfasta.pl @ 6:2277dd59b9f9 draft
Uploaded
author | hammock |
---|---|
date | Wed, 01 Nov 2017 05:54:28 -0400 |
parents | |
children |
line wrap: on
line diff
#! /usr/bin/env perl
# splitfasta.pl
# Split a file with multiple, FASTA formatted sequences into many single-sequence FASTA files
#
# (C) Johannes Soeding, 2012
#
# HHsuite version 2.0.15 (June 2012)
#
# Reference:
# Remmert M., Biegert A., Hauser A., and Soding J.
# HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
# Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de

use lib $ENV{"HHLIB"}."/scripts";
use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
use strict;
use warnings;

# Default extension of the generated files; can be overridden with -ext.
my $ext="seq";

# $VERSION is not declared in this script; presumably it is exported by HHPaths.
my $usage="
splitfasta.pl from HHsuite $VERSION
Split a file with multiple, FASTA formatted sequences into multiple single-sequence FASTA files.
Write files into current directory and name each file by the first word after \">\" in the name line.

Usage: splitfasta.pl infile [option]
Option:
-fam       : use family-based name (for SCOP/ASTRAL sequences)
-name      : use sequence name as file name (default)
-ext <ext> : extension for sequence files (default=$ext)
\n";

if (@ARGV<1) {die $usage;}

my $infile=$ARGV[0];
my $fam=0;       # option -fam?
my $use_name=0;  # option -name? (overrides -fam, whatever the order on the command line)

# Parse the options that follow the input file name.
# Tokens are compared exactly, so an option can no longer be "found" inside
# another token (e.g. "-fam" inside "-family" or inside the value of -ext).
# Unrecognised tokens are silently ignored.
my @args=@ARGV[1..$#ARGV];
while (@args) {
    my $arg=shift(@args);
    if ($arg eq "-fam") {
        $fam=1;
    } elsif ($arg eq "-name") {
        $use_name=1;
    } elsif ($arg eq "-ext") {
        if (@args) {
            $ext=shift(@args);
        } else {
            warn("WARNING: option -ext needs a value; using default extension '$ext'\n");
        }
    }
}
if ($use_name) {$fam=0;}

# Write one complete FASTA record (name line plus sequence lines) to $path,
# replacing any existing file of that name. Dies if the file cannot be
# opened, written or closed.
sub write_sequence_file {
    my ($path,$record)=@_;
    # Three-argument open: characters in $path can never be read as an open mode.
    open(my $out,">",$path) or die("ERROR: Can't open $path: $!\n");
    print {$out} $record or die("ERROR: Can't write to $path: $!\n");
    # Buffered write errors only surface at close, so check it as well.
    close($out) or die("ERROR: Can't close $path: $!\n");
    return;
}

open(my $in,"<",$infile) or die("ERROR: Can't open $infile: $!\n");

my $outfile;       # file name for the record currently being collected
my $sequence="";   # text of the record currently being collected
my $n=0;           # number of name lines read in so far
my %numfams=();    # -fam mode: number of sequences seen so far per family id
my %exists=();     # name mode: sequence ids already seen

while (my $line=<$in>) {

    # Is this a name line that starts a new record?
    # -fam mode needs two words after ">": the sequence id and the family id.
    # NOTE(review): in -fam mode a ">" line without a second word is NOT
    # treated as a name line and is appended to the current record.
    my ($id,$famid);
    if ($fam) {
        ($id,$famid) = $line=~/^>(\S+)\s+(\S+)/;
    } else {
        ($id) = $line=~/^>(\S+)/;
    }

    if (!defined $id) {
        # Sequence line (or anything before the first name line, which is
        # discarded once the first name line is read).
        $sequence.=$line;
        next;
    }

    # A new record starts: flush the previous one first.
    if ($n) {write_sequence_file($outfile,$sequence);}

    if ($fam) {
        # Name the file <family id>.<running number within family>.<ext>
        $numfams{$famid}++;
        $outfile=$famid.".".$numfams{$famid}.".".$ext;
    } else {
        if ($exists{$id}) {print("\nWarning: id $id appears more than once in $infile\n");}
        $exists{$id}=1;
        # Replace "|" and "." in the id by "_" to build the file name.
        (my $base=$id) =~ s/[|.]/_/g;
        $outfile=$base.".".$ext;
    }
    $sequence=$line;
    $n++;
}

# Flush the last record.
if ($n) {write_sequence_file($outfile,$sequence);}

close($in);
printf("Created %i sequence files\n",$n);