Mercurial > repos > cpt > cpt_psm_recombine
comparison cpt_psm_0_prep.pl @ 1:97ef96676b48 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:51:26 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:b18e8268bf4e | 1:97ef96676b48 |
---|---|
1 #!/usr/bin/env perl | |
2 use strict; | |
3 use warnings; | |
4 use Storable; | |
5 use Bio::SearchIO; | |
6 use Bio::SeqIO; | |
7 use Bio::Tools::CodonTable; | |
8 use Data::Dumper; | |
9 use File::Temp qw/tempfile tempdir/; | |
10 | |
11 | |
12 use CPT::GalaxyGetOpt; | |
13 my $ggo = CPT::GalaxyGetOpt->new(); | |
14 my $options = $ggo->getOptions( | |
15 'options' => [ | |
16 [ 'file', 'Input file', { validate => 'File/Input', multiple => 1, required => 1, | |
17 file_format => ['genbank', 'txt'], | |
18 } ], | |
19 ], | |
20 'outputs' => [ | |
21 [ | |
22 'cpt_psm_object', | |
23 'Output PSM Object', | |
24 { | |
25 validate => 'File/Output', | |
26 required => 1, | |
27 default => 'cpt_psm', | |
28 data_format => 'text/plain', | |
29 default_format => 'TXT' | |
30 } | |
31 ], | |
32 ], | |
33 'defaults' => [ | |
34 'appid' => 'PSM.Prep', | |
35 'appname' => 'PSM Prep', | |
36 'appdesc' => 'prepares data for the PSM Plotter', | |
37 'appvers' => '1.94.2', | |
38 ], | |
39 'tests' => [], | |
40 ); | |
41 | |
42 | |
43 | |
44 use CPT::Bio; | |
45 my $bio = CPT::Bio->new(); | |
46 | |
47 my @genbank_files = @{$options->{file}}; | |
48 | |
49 my %data = ( | |
50 file_list => [], | |
51 ); | |
52 | |
53 my $GLOBAL_UNLINK_VAR = 1; | |
54 my $tempdir = tempdir('cpt.psm2.XXXXXXX',CLEANUP => $GLOBAL_UNLINK_VAR); | |
55 use CPT::Util::CRC64; | |
56 my $crc = CPT::Util::CRC64->new(); | |
57 | |
58 foreach my $file(@genbank_files){ | |
59 my $seqio_object = Bio::SeqIO->new(-file => $file,-format=>'genbank'); | |
60 while(my $seqobj = $seqio_object->next_seq){ | |
61 my ( $fh, $path ) = tempfile('cds_export.XXXXXXXXX', UNLINK => $GLOBAL_UNLINK_VAR, DIR => $tempdir, SUFFIX => '.fa'); | |
62 | |
63 my @gi_array; | |
64 foreach my $feat ( $seqobj->get_SeqFeatures ) { | |
65 if($feat->primary_tag eq 'CDS'){ | |
66 my $header = $bio->_getIdentifier($feat); | |
67 # This ensures proteins have a file-specific ID appeneded to them. | |
68 my $seq = $bio->translate( | |
69 $bio->intelligent_get_seq($feat)); | |
70 | |
71 # Proteins come with translated stop codon | |
72 $seq =~ s/\*//g; | |
73 $seq =~ s/\+//g; | |
74 $seq =~ s/#//g; | |
75 $header .= "_" . $crc->crc64($seq); | |
76 push @gi_array, $header; | |
77 print $fh ">$header\n$seq\n"; | |
78 } | |
79 } | |
80 $data{gbk}{$seqobj->display_id()}{'gi'} = \@gi_array; | |
81 $data{gbk}{$seqobj->display_id()}{'fasta_location'} = $path; | |
82 $data{gbk}{$seqobj->display_id()}{'gbk_location'} = $file; | |
83 push(@{$data{file_list}}, $seqobj->display_id()); | |
84 close $fh; | |
85 } | |
86 } | |
87 | |
88 | |
89 | |
90 use IPC::Run3; | |
91 | |
92 # Concatenate Fasta Files | |
93 my @fasta_files; | |
94 foreach(@{$data{file_list}}){ | |
95 push(@fasta_files, $data{gbk}{$_}{fasta_location}); | |
96 } | |
97 my @command = ('cat', @fasta_files); | |
98 my ($merged_fa_fh, $merged_fa_path) = tempfile('merged.XXXXXXXXX', UNLINK => 1, DIR => $tempdir, SUFFIX => '.fa'); | |
99 my ($in, $out, $err); | |
100 run3 \@command, \$in, \$out, \$err; | |
101 if($err){ | |
102 print STDERR $err; | |
103 } | |
104 print $merged_fa_fh $out; | |
105 close($merged_fa_fh); | |
106 | |
107 | |
108 # Create Blast Database | |
109 my ($blastdb_fh, $blastdb_path) = tempfile('blastdb.XXXXXXXXX', UNLINK => 1, DIR => $tempdir); | |
110 @command = ('makeblastdb', | |
111 '-dbtype', 'prot', | |
112 '-in', $merged_fa_path, | |
113 '-out', $blastdb_path, | |
114 ); | |
115 my ($makeblast_in,$makeblast_out,$makeblast_err); | |
116 run3 \@command, \$makeblast_in, \$makeblast_out, \$makeblast_err; | |
117 | |
118 # Blast files | |
119 foreach(@{$data{file_list}}){ | |
120 #push(@fasta_files, $data{gbk}{$_}{fasta_location}); | |
121 my @blast_cmd = ('blastp', | |
122 '-query', $data{gbk}{$_}{fasta_location}, | |
123 '-out', $data{gbk}{$_}{fasta_location} . ".xml", | |
124 '-outfmt', '5', | |
125 '-db', $blastdb_path, | |
126 ); | |
127 my ($blast_in,$blast_out,$blast_err); | |
128 run3 \@blast_cmd, \$blast_in, \$blast_out, \$blast_err; | |
129 } | |
130 | |
131 my $value; | |
132 my @data_tsv; | |
133 | |
134 foreach(@{$data{file_list}}){ | |
135 #push(@fasta_files, $data{gbk}{$_}{fasta_location}); | |
136 my $file = $data{gbk}{$_}{fasta_location}; | |
137 | |
138 my $in = new Bio::SearchIO( | |
139 -format => 'blastxml', | |
140 -tempfile => 1, | |
141 -file => "$file.xml", | |
142 ); | |
143 while( my $result = $in->next_result ) { | |
144 while( my $hit = $result->next_hit ) { | |
145 while( my $hsp = $hit->next_hsp ) { | |
146 my $Identity = $hsp->percent_identity/100 * $hsp->length('query'); | |
147 my $IterationQueryLength = $result->query_length(); | |
148 my $HitLength = $hit->length(); | |
149 my $dice = (200*$Identity)/($HitLength + $IterationQueryLength); | |
150 | |
151 my $c = $result->query_description();#genome_a_header | |
152 my $d = $hit->name();#genome_b_header | |
153 # Skip self-self links | |
154 next if($c eq $d); | |
155 | |
156 push (@data_tsv, | |
157 [ | |
158 $c, | |
159 $d, | |
160 $hsp->evalue(), | |
161 $dice, | |
162 ] | |
163 ); | |
164 | |
165 }#close hsp | |
166 }#close hit | |
167 }#close result | |
168 }#close for genomes | |
169 | |
170 $data{hit_table} = \@data_tsv; | |
171 | |
172 use CPT::OutputFiles; | |
173 my $psmout = CPT::OutputFiles->new( | |
174 name => 'cpt_psm_object', | |
175 GGO => $ggo, | |
176 ); | |
177 my @val = $psmout->CRR(data => "none", extension => 'psm'); | |
178 store \%data,$val[0]; | |
179 | |
180 | |
181 | |
182 | |
183 | |
184 | |
185 | |
186 =head1 NAME | |
187 | |
188 PSM Prep | |
189 | |
190 =head1 DESCRIPTION | |
191 | |
192 This tool takes in 2 or more GenBank files, blasts, and prepares data structures for use in the companion tool: PSM Plotter. Select as many (multi)-gbk files as you I<might> want to plot. Once this tool is done, you can select any subset of those to plot then. | |
193 | |
194 =cut |