6
|
1 #! /usr/bin/env perl
|
|
2 # splitfasta.pl
|
|
3 # Split a file with multiple, FASTA formatted sequences into many single-sequence FASTA files
|
|
4 #
|
|
5 # (C) Johannes Soeding, 2012
|
|
6 #
|
|
7 # HHsuite version 2.0.15 (June 2012)
|
|
8 #
|
|
9 # Reference:
|
|
10 # Remmert M., Biegert A., Hauser A., and Soding J.
|
|
11 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
|
|
12 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
|
|
13
|
|
14 # This program is free software: you can redistribute it and/or modify
|
|
15 # it under the terms of the GNU General Public License as published by
|
|
16 # the Free Software Foundation, either version 3 of the License, or
|
|
17 # (at your option) any later version.
|
|
18
|
|
19 # This program is distributed in the hope that it will be useful,
|
|
20 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
21 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
22 # GNU General Public License for more details.
|
|
23
|
|
24 # You should have received a copy of the GNU General Public License
|
|
25 # along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
26
|
|
27 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de
|
|
28
|
|
29 use lib $ENV{"HHLIB"}."/scripts";
|
|
30 use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
|
|
31 use strict;
|
|
32 use warnings;
|
|
33
|
|
# Default extension of the generated sequence files; may be overridden
# with the -ext option.
my $ext="seq";

# Help text shown when the script is called without an input file.
# $VERSION is not defined in this file; it is expected to be provided by
# the HHPaths module loaded above.
my $usage="
splitfasta.pl from HHsuite $VERSION
Split a file with multiple, FASTA formatted sequences into multiple single-sequence FASTA files.
Write files into current directory and name each file by the first word after \">\" in the name line.

Usage: splitfasta.pl infile [option]
Option:
-fam : use family-based name (for SCOP/ASTRAL sequences)
-name : use sequence name as file name (default)
-ext <ext> : extension for sequence files (default=$ext)
\n";

# At least the input file must be given on the command line.
if (@ARGV<1) {die $usage;}
|
|
48
|
|
# State shared with the record-splitting code further down.
my $infile   = $ARGV[0];   # multi-sequence FASTA file to split
my ($line, $outfile);      # current input line / file name of the current record
my $sequence = "";         # text of the record being collected
my $famid    = "";         # family id taken from the name line (-fam mode)
my %numfams  = ();         # per-family counters (-fam mode)
my $n        = 0;          # number of name lines read in so far
my $fam      = 0;          # option -fam?

# Everything after the input file is treated as one option string.
# The slice is empty when only the input file was given, so $options is "".
my $options = join(" ", @ARGV[1 .. $#ARGV]);

# Parse the options. Each recognised option is removed from the string.
# -name is tested after -fam, so it wins when both are present.
$fam = 1  if $options =~ s/-fam//g;
$fam = 0  if $options =~ s/-name//g;
$ext = $1 if $options =~ s/-ext\s+(\S+)//g;
|
|
67
|
|
68
|
|
# Write one finished record (name line plus the lines that followed it)
# to $path, replacing any existing file of that name.
# Dies if the file cannot be opened or if writing it fails.
sub _write_sequence_file {
    my ($path, $record) = @_;
    # Three-argument open: $path is built from text found in the input file,
    # so it must never be interpreted as an open mode.
    open(my $out_fh, ">", $path) or die("ERROR: Can't open $path: $!\n");
    print {$out_fh} $record;
    # Buffered write errors only surface when the handle is closed.
    close($out_fh) or die("ERROR: Can't write $path: $!\n");
    return;
}

# Open the input file. "-" keeps its meaning of standard input, which the
# previous two-argument open provided implicitly.
my $in_fh;
if ($infile eq "-") {
    $in_fh = \*STDIN;
} else {
    open($in_fh, "<", $infile) or die("ERROR: Can't open $infile: $!\n");
}

my %exists = ();   # ids already seen (name mode only)

# Read the input line by line. A name line starts a new record; every other
# line is appended to the record currently being collected. A record is
# written out when the next name line (or the end of the input) is reached.
while (defined($line = <$in_fh>)) {
    my $next_outfile;   # defined only if $line starts a new record

    if ($fam) {
        # -fam mode: the second word of the name line is the family id.
        # Files are named <family id>.<running number within family>.<ext>.
        if ($line =~ /^>(\S+)\s+(\S+)/) {
            $famid = $2;
            $numfams{$famid}++;
            $next_outfile = "$famid.$numfams{$famid}.$ext";
        } elsif ($line =~ /^>/) {
            # Without a second word there is no family id, so the line cannot
            # start a new file; say so instead of merging it silently.
            warn("\nWarning: name line has no second word to use as family id; not starting a new file: $line");
        }
    } elsif ($line =~ /^>(\S+)/) {
        # name mode: the first word after ">" is the file name, with "|" and
        # "." replaced by "_".
        my $id = $1;
        if ($exists{$id}) {print("\nWarning: id $id appears more than once in $infile\n");}
        $exists{$id} = 1;
        (my $basename = $id) =~ s/[|.]/_/g;
        $next_outfile = "$basename.$ext";
    }

    if (defined $next_outfile) {
        # Flush the previous record (if any) before starting the new one.
        _write_sequence_file($outfile, $sequence) if $n;
        $outfile  = $next_outfile;
        $sequence = $line;
        $n++;
    } else {
        $sequence .= $line;
    }
}

# Flush the last record.
_write_sequence_file($outfile, $sequence) if $n;

close($in_fh);
printf("Created %i sequence files\n",$n);
|
|
127
|
|
128
|
|
129
|