Mercurial > repos > dereeper > pangenome_explorer
comparison COG/bac-genomics-scripts/cat_seq/cat_seq.pl @ 3:e42d30da7a74 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:52:25 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:97e4e3e818b6 | 3:e42d30da7a74 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 use warnings; | |
4 use strict; | |
5 use Bio::SeqIO; # bioperl module to handle sequence input/output | |
6 use Bio::Seq; # bioperl module to handle sequences with features | |
7 use Bio::SeqUtils; # bioperl module with additional methods (including features) for Bio::Seq objects | |
8 | |
# Usage/help text; emitted through die() when the input file argument is
# missing or a help switch is given.
my $usage = "\n".
"\t#################################################################\n".
"\t# $0 multi-seq_file [outfile-format] #\n". #$0 = program name
"\t# #\n".
"\t# The script merges RichSeq sequences (embl or genbank, but #\n".
"\t# also fasta) in a multi-sequence file to one artificial #\n".
"\t# sequence. The first sequence in the file is used as a #\n".
"\t# foundation to add the subsequent sequences (along with #\n".
"\t# features and annotations). Optionally, a different output #\n".
"\t# file format can be specified (fasta/embl/genbank). #\n".
"\t# The script uses bioperl (www.bioperl.org). #\n".
"\t# #\n".
"\t# Adjust unix loop to run the script with all multi-seq files #\n".
"\t# in the current working directory, e.g.: #\n".
"\t# for i in *.embl; do cat_seq.pl \$i genbank; done #\n".
"\t# #\n".
"\t# version 0.1 A Leimbach #\n".
"\t# 08.02.2013 aleimba[at]gmx[dot]de #\n".
"\t#################################################################\n\n";

### Shift arguments from @ARGV or give usage
my $multi_seq = shift or die $usage; # input multi-sequence file (required)
my $format = shift;                  # output format (optional)

# Only a real help switch (-h, -help, --h, --help) shows the usage. The
# pattern is anchored so that file names which merely contain "-h"
# (e.g. "strain-h7.gbk") are still accepted as input.
if ($multi_seq =~ /\A--?h(?:elp)?\z/) {
    die $usage;
}


### Derive the output file name: "<basename>_artificial.<ext>"
# The name is built in its own variable and both fallbacks below guarantee it
# differs from the input path, so the input file can never be opened for
# writing and clobbered.
my $outfile = $multi_seq;
my $has_format = defined $format && length $format;
if ($has_format) {
    # swap the extension for the requested format; if the input has no
    # extension, simply append the suffix
    unless ($outfile =~ s/^(.+)\.\w+$/$1_artificial.$format/) {
        $outfile .= "_artificial.$format";
    }
} else {
    # keep the input extension; without an extension and without an explicit
    # format there is nothing to name the output after, so stop before any
    # file is touched
    $outfile =~ s/^(.+)(\.\w+)$/$1_artificial$2/
        or die "Input file \"$multi_seq\" has no file extension, please specify an outfile-format!\n";
}


### Bio::SeqIO/Seq objects to concat the seqs
print "\nConcatenating multi-sequence file \"$multi_seq\" to an artificial sequence file ...\n";
my $seqin = Bio::SeqIO->new(-file => "<$multi_seq"); # Bio::SeqIO object; no '-format' given, leave it to bioperl guessing
my @seqs; # store Bio::Seq objects for each seq in the multi-seq file
while (my $seq = $seqin->next_seq) { # Bio::Seq object
    push @seqs, $seq;
}

# Bail out with a readable message instead of handing an empty list on
# to the concatenation and writing steps.
die "No sequences found in \"$multi_seq\", nothing to concatenate!\n" unless @seqs;

Bio::SeqUtils->cat(@seqs);
my $cat_seq = shift @seqs; # the first sequence in the array ($seqs[0]) was modified!


### Write the artificial/concatenated sequence (with its features) to output Bio::SeqIO object
my $seqout = $has_format
    ? Bio::SeqIO->new(-file => ">$outfile", -format => $format)
    : Bio::SeqIO->new(-file => ">$outfile"); # no '-format' given, leave it to bioperl guessing
$seqout->write_seq($cat_seq);
print "Created new file \"$outfile\"!\n\n";

exit;