annotate COG/bac-genomics-scripts/cat_seq/cat_seq.pl @ 14:5a5c9a6b047b draft

Uploaded
author dereeper
date Tue, 10 Dec 2024 16:20:53 +0000
parents e42d30da7a74
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
3 use warnings;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
4 use strict;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
5 use Bio::SeqIO; # bioperl module to handle sequence input/output
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
6 use Bio::Seq; # bioperl module to handle sequences with features
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
7 use Bio::SeqUtils; # bioperl module with additional methods (including features) for Bio::Seq objects
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
8
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
9 my $usage = "\n".
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
10 "\t#################################################################\n".
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
11 "\t# $0 multi-seq_file [outfile-format] #\n". #$0 = program name
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
12 "\t# #\n".
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
13 "\t# The script merges RichSeq sequences (embl or genbank, but #\n".
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
14 "\t# also fasta) in a multi-sequence file to one artificial #\n".
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
15 "\t# sequence. The first sequence in the file is used as a #\n".
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
16 "\t# foundation to add the subsequent sequences (along with #\n".
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
17 "\t# features and annotations). Optionally, a different output #\n".
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
18 "\t# file format can be specified (fasta/embl/genbank). #\n".
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
19 "\t# The script uses bioperl (www.bioperl.org). #\n".
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
20 "\t# #\n".
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
21 "\t# Adjust unix loop to run the script with all multi-seq files #\n".
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
22 "\t# in the current working directory, e.g.: #\n".
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
23 "\t# for i in *.embl; do cat_seq.pl \$i genbank; done #\n".
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
24 "\t# #\n".
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
25 "\t# version 0.1 A Leimbach #\n".
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
26 "\t# 08.02.2013 aleimba[at]gmx[dot]de #\n".
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
27 "\t#################################################################\n\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
28
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
29 ### Shift arguments from @ARGV or give usage
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
30 my $multi_seq = shift or die $usage;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
31 my $format = shift;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
32 if ($multi_seq =~/-h/) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
33 die $usage;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
34 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
35
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
36
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
37 ### Bio::SeqIO/Seq objects to concat the seqs
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
38 print "\nConcatenating multi-sequence file \"$multi_seq\" to an artificial sequence file ...\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
39 my $seqin = Bio::SeqIO->new(-file => "<$multi_seq"); # Bio::SeqIO object; no '-format' given, leave it to bioperl guessing
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
40 my @seqs; # store Bio::Seq objects for each seq in the multi-seq file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
41 while (my $seq = $seqin->next_seq) { # Bio::Seq object
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
42 push(@seqs, $seq);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
43 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
44 Bio::SeqUtils->cat(@seqs);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
45 my $cat_seq = shift @seqs; # the first sequence in the array ($seqs[0]) was modified!
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
46
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
47
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
48 ### Write the artificial/concatenated sequence (with its features) to output Bio::SeqIO object
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
49 my $seqout; # Bio::SeqIO object
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
50 if ($format) { # true if defined
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
51 $multi_seq =~ s/^(.+)\.\w+$/$1_artificial\.$format/;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
52 $seqout = Bio::SeqIO->new(-file => ">$multi_seq", -format => "$format");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
53 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
54 $multi_seq =~ s/^(.+)(\.\w+)$/$1_artificial$2/;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
55 $seqout = Bio::SeqIO->new(-file => ">$multi_seq");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
56 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
57 $seqout->write_seq($cat_seq);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
58 print "Created new file \"$multi_seq\"!\n\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
59
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
60 exit;