Mercurial > repos > hammock > hammock
comparison external_tools/linux/lib/hh/scripts/splitfasta.pl @ 6:2277dd59b9f9 draft
Uploaded
author | hammock |
---|---|
date | Wed, 01 Nov 2017 05:54:28 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:b7652b7c97bd | 6:2277dd59b9f9 |
---|---|
1 #! /usr/bin/env perl | |
2 # splitfasta.pl | |
3 # Split a file with multiple, FASTA formatted sequences into many single-sequence FASTA files | |
4 # | |
5 # (C) Johannes Soeding, 2012 | |
6 # | |
7 # HHsuite version 2.0.15 (June 2012) | |
8 # | |
9 # Reference: | |
10 # Remmert M., Biegert A., Hauser A., and Soding J. | |
11 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment. | |
12 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011). | |
13 | |
14 # This program is free software: you can redistribute it and/or modify | |
15 # it under the terms of the GNU General Public License as published by | |
16 # the Free Software Foundation, either version 3 of the License, or | |
17 # (at your option) any later version. | |
18 | |
19 # This program is distributed in the hope that it will be useful, | |
20 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
21 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
22 # GNU General Public License for more details. | |
23 | |
24 # You should have received a copy of the GNU General Public License | |
25 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
26 | |
27 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de | |
28 | |
29 use lib $ENV{"HHLIB"}."/scripts"; | |
30 use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc. | |
31 use strict; | |
32 use warnings; | |
33 | |
# Default extension of the generated sequence files; overridden by -ext <ext>.
my $ext="seq";

# Help text shown when the script is called without arguments.
# NOTE(review): $VERSION is not defined in this script; presumably it is
# exported by HHPaths -- confirm.
my $usage="
splitfasta.pl from HHsuite $VERSION
Split a file with multiple, FASTA formatted sequences into multiple single-sequence FASTA files.
Write files into current directory and name each file by the first word after \">\" in the name line.

Usage: splitfasta.pl infile [option]
Option:
-fam : use family-based name (for SCOP/ASTRAL sequences)
-name : use sequence name as file name (default)
-ext <ext> : extension for sequence files (default=$ext)
\n";

if (@ARGV<1) {die $usage;}

my $line;            # current input line
my $infile=$ARGV[0]; # multi-sequence FASTA file to split
my $outfile;         # name of the file the current record will be written to
my $sequence="";     # name line plus residue lines of the current record
my $options="";      # all command line arguments after infile, joined by blanks
my $fam=0;           # option -fam?
my $famid="";        # second word of the current name line (used with -fam)
my %numfams=();      # number of records seen so far per family id
my $n=0;             # number of name lines read in so far

if (@ARGV>1) {
    $options.=join(" ",@ARGV[1..$#ARGV]);
}

# Parse the options. The flags are matched only as whole, whitespace-delimited
# words, so that e.g. the argument of "-ext x-fam" is not mistaken for -fam.
# -name is evaluated after -fam and therefore wins if both are given.
if ($options=~s/(?<!\S)-fam(?!\S)//g)  {$fam=1;}
if ($options=~s/(?<!\S)-name(?!\S)//g) {$fam=0;}
if ($options=~s/(?<!\S)-ext\s+(\S+)//g) {$ext=$1;}
68 | |
# Write one complete FASTA record (name line + residue lines) to $path.
# Three-argument open is used because $path is derived from the name lines of
# the input file and must never be interpreted as an open mode.
# Dies if the file cannot be created or completely written.
sub _write_fasta_record {
    my ($path, $record) = @_;
    open(my $out, ">", $path) or die("ERROR: Can't open $path: $!\n");
    print {$out} $record;
    close($out) or die("ERROR: Can't write $path: $!\n");
    return;
}

# Open the input. "-" still means STDIN, as it did with the former
# two-argument open.
my $in;
if ($infile eq "-") {
    $in = \*STDIN;
} else {
    open($in, "<", $infile) or die("ERROR: Can't open $infile: $!\n");
}

# Read the input line by line. Every name line that is recognised starts a new
# record; the previous record is flushed to its file at that point. All other
# lines are appended to the current record.
my %exists=();  # ids already seen (only used without -fam)
while ($line=<$in>) {
    my $newfile;  # defined only if $line starts a new record

    if ($fam) {
        # Family mode: the file is named <family id>.<running number>.<ext>,
        # where the family id is the second word of the name line.
        # Name lines without a second word are not recognised as record starts.
        if ($line=~/^>\S+\s+(\S+)/) {
            $famid=$1;
            $numfams{$famid}++;
            $newfile="$famid.$numfams{$famid}.$ext";
        }
    } elsif ($line=~/^>(\S+)/) {
        # Name mode: the file is named after the first word of the name line,
        # with "|" and "." replaced by "_".
        my $id=$1;
        if ($exists{$id}) {print("\nWarning: id $id appears more than once in $infile\n");}
        $exists{$id}=1;
        (my $tmp=$id)=~s/[|.]/_/g;
        $newfile="$tmp.$ext";
    }

    if (defined $newfile) {
        if ($n) {_write_fasta_record($outfile,$sequence);}
        $outfile=$newfile;
        $sequence=$line;
        $n++;
    } else {
        $sequence.=$line;
    }
}

# Flush the last record
if ($n) {_write_fasta_record($outfile,$sequence);}

close($in);
printf("Created %i sequence files\n",$n);
127 | |
128 | |
129 |