annotate external_tools/linux/lib/hh/scripts/splitfasta.pl @ 6:2277dd59b9f9 draft

Uploaded
author hammock
date Wed, 01 Nov 2017 05:54:28 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
1 #! /usr/bin/env perl
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
2 # splitfasta.pl
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
3 # Split a file with multiple, FASTA formatted sequences into many single-sequence FASTA files
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
4 #
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
5 # (C) Johannes Soeding, 2012
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
6 #
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
7 # HHsuite version 2.0.15 (June 2012)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
8 #
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
9 # Reference:
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
10 # Remmert M., Biegert A., Hauser A., and Soding J.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
11 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
12 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
13
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
14 # This program is free software: you can redistribute it and/or modify
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
15 # it under the terms of the GNU General Public License as published by
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
16 # the Free Software Foundation, either version 3 of the License, or
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
17 # (at your option) any later version.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
18
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
19 # This program is distributed in the hope that it will be useful,
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
20 # but WITHOUT ANY WARRANTY; without even the implied warranty of
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
21 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
22 # GNU General Public License for more details.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
23
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
24 # You should have received a copy of the GNU General Public License
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
25 # along with this program. If not, see <http://www.gnu.org/licenses/>.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
26
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
27 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
28
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
29 use lib $ENV{"HHLIB"}."/scripts";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
30 use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
31 use strict;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
32 use warnings;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
33
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
# Extension of the generated single-sequence files; can be overridden with -ext.
my $ext="seq";
my $usage="
splitfasta.pl from HHsuite $VERSION
Split a file with multiple, FASTA formatted sequences into multiple single-sequence FASTA files.
Write files into current directory and name each file by the first word after \">\" in the name line.

Usage: splitfasta.pl infile [option]
Option:
  -fam       : use family-based name (for SCOP/ASTRAL sequences)
  -name      : use sequence name as file name (default)
  -ext <ext> : extension for sequence files (default=$ext)
\n";

if (@ARGV<1) {die $usage;}

my $infile=$ARGV[0];
my $outfile;       # name of the file the record being collected will be written to
my $sequence="";   # name line plus all following lines of the record being collected
my $options="";
my $fam=0;         # option -fam?
my %numfams=();    # family id => number of name lines seen so far for that family
my %exists=();     # sequence ids already seen (option -name: used for the duplicate warning)
my $n=0;           # number of name lines read in so far

# Write one record to $path, replacing any existing file of that name.
# Dies if the file cannot be opened or if the buffered data cannot be flushed.
sub write_record {
    my ($path, $record) = @_;
    open(my $out_fh, ">", $path) or die("ERROR: Can't open $path: $!\n");
    print {$out_fh} $record;
    close($out_fh) or die("ERROR: Can't write $path: $!\n");
    return;
}

if (@ARGV>1) {
    $options.=join(" ",@ARGV[1..$#ARGV]);
}

# Parse the options; -name is evaluated after -fam and therefore wins if both are given
if ($options=~s/-fam//g)  {$fam=1;}
if ($options=~s/-name//g) {$fam=0;}
if ($options=~s/-ext\s+(\S+)//g) {$ext=$1;}

# Three-argument open: the file name is never interpreted as an open mode
open(my $in_fh, "<", $infile) or die("ERROR: Can't open $infile: $!\n");

while (my $line=<$in_fh>) {

    # Determine whether this line starts a new record and, if so, its file name
    my $next_outfile;
    if ($fam) {
        # Family mode: a name line needs a second word, which is taken as the family id.
        # A ">" line without a second word does not match and is appended to the current record.
        if ($line=~/^>\S+\s+(\S+)/) {
            my $famid=$1;
            $numfams{$famid}++;   # running index within this family, starting at 1
            $next_outfile=join(".", $famid, $numfams{$famid}, $ext);
        }
    }
    elsif ($line=~/^>(\S+)/) {
        # Name mode: the first word after ">" is the sequence id
        my $id=$1;
        if ($exists{$id}) {print("\nWarning: id $id appears more than once in $infile\n");}
        $exists{$id}=1;
        (my $basename=$id)=~tr/|./_/;   # replace "|" and "." in the id by "_"
        $next_outfile="$basename.$ext";
    }

    if (defined $next_outfile) {
        # A new record begins: flush the previous one (if any) before starting to collect
        if ($n) {write_record($outfile, $sequence);}
        $outfile=$next_outfile;
        $sequence=$line;   # lines read before the first name line are discarded here
        $n++;
    }
    else {
        $sequence.=$line;
    }
}

# Flush the last record
if ($n) {write_record($outfile, $sequence);}

close($in_fh);
printf("Created %i sequence files\n",$n);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
127
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
128
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
129