0
+ − 1 #!/usr/bin/perl -w
+ − 2
+ − 3 =head1 SYNOPSIS
+ − 4
+ − 5 micomplete.pl - Given a series of HMM profiles and a query proteome, and optionally, weights for the profiles, returns the completeness of a genome.
+ − 6
+ − 7 =head1 USAGE
+ − 8
+ − 9 micomplete.pl -h|--hmm_profiles -p|--proteome [-w|--weights] [-e|--evalue] [-s|--save-result-tab] [-r|--save-hmm-results] [-t|--threads]
+ − 10 micomplete.pl --help
+ − 11
+ − 12 =head1 INPUT
+ − 13
+ − 14 =head2 -h|--hmm
+ − 15
+ − 16 A single file containing the HMM profiles against which the proteome will be run.
+ − 17
+ − 18 =head2 -p|--proteome
+ − 19
+ − 20 A fasta file containing all protein sequences in a genome.
+ − 21
+ − 22 =head2 [-c|--cut-off]
+ − 23
+ − 24 The evalue cut-off to accept the presence of a HMM. 1e-10 by default.
+ − 25
+ − 26 =head2 [-w|--weights]
+ − 27
+ − 28 Optionally, the weights to apply to the HMMs, for the completeness report. A tab file containing (i) a column with names corresponding to the names in the hmm profiles and (ii) a column with with the weighths.
+ − 29
+ − 30 =head2 [-e|--evalue]
+ − 31
+ − 32 Shows evalue to the best hit, instead of presence/absence.
+ − 33
+ − 34 =head2 [-s|--save-result-tab]
+ − 35
+ − 36 Save the result per marker
+ − 37
+ − 38 =head2 [-r|--save-hmm-tab]
+ − 39
+ − 40 Save hmmsearch tabular result for this file.
+ − 41
+ − 42 =head2 [-a|--save-hmm-alignments]
+ − 43
+ − 44 Save hmmsearch tabular result for this file.
+ − 45
+ − 46 =head2 [-t|--threads]
+ − 47
+ − 48 Number of threads to use in the hmmsearch. 15 by default.
+ − 49
+ − 50 =head2 --help
+ − 51
+ − 52 Displays a help message.
+ − 53
+ − 54 =head1 OUTPUT
+ − 55
+ − 56 A completeness report expressed in fraction, and a list of all hmms identified as positive
+ − 57
+ − 58 =head1 AUTHOR
+ − 59
+ − 60 Lionel Guy (lionel.guy@icm.uu.se)
+ − 61
+ − 62 =head1 DATE
+ − 63
+ − 64 Mon Oct 14 14:09:46 CEST 2013
+ − 65
+ − 66 =cut
+ − 67
+ − 68 # libraries
+ − 69 use strict;
+ − 70 use Pod::Usage;
+ − 71 use Getopt::Long;
+ − 72 # For safe use of tempfiles
+ − 73 use File::Temp qw/ tempfile tempdir /;
+ − 74 use File::Copy qw/ cp /;
+ − 75
+ − 76 # Options
+ − 77 my ($hmm_file, $fasta_in, $weights_file, $display_eval);
+ − 78 my $cutoff = 1e-10;
+ − 79 my ($save_tab_file, $save_aln_file, $save_res_tab);
+ − 80 my $nthreads = 15;
+ − 81 my $help;
+ − 82 GetOptions(
+ − 83 'h|hmm=s' => \$hmm_file,
+ − 84 'p|proteome=s' => \$fasta_in,
+ − 85 'c|cut-off=s' => \$cutoff,
+ − 86 'w|weights=s' => \$weights_file,
+ − 87 'e|evalue' => \$display_eval,
+ − 88 'r|save-hmm-tab=s' => \$save_tab_file,
+ − 89 'a|save-hmm-alignments=s' => \$save_aln_file,
+ − 90 's|save-result-tab=s' => \$save_res_tab,
+ − 91 't|threads=s' => \$nthreads,
+ − 92 'help' => \$help,
+ − 93 );
+ − 94
+ − 95 # Tempfile if not saving results
+ − 96 my $template = 'tempXXXXXXXX';
+ − 97 my ($fh_tab, $tab_file) = tempfile($template, DIR => '.', UNLINK => 1);
+ − 98 my ($fh_aln, $aln_file) = tempfile($template, DIR => '.', UNLINK => 1);
+ − 99
+ − 100 # Check options
+ − 101 pod2usage(-verbose => 1) if $help;
+ − 102 pod2usage(-message => "No hmm profile file (-h|--hmm) defined\n",
+ − 103 -exitstatus => 2 ) unless ($hmm_file);
+ − 104 pod2usage(-message => "No proteome file (-p|--proteome) defined\n",
+ − 105 -exitstatus => 2 ) unless ($fasta_in);
+ − 106
+ − 107 # Parse eventual weights
+ − 108 my %hmms;
+ − 109 if ($weights_file){
+ − 110 open WEIGHTS, '<', $weights_file or die "Cannot open $weights_file\n";
+ − 111 while (<WEIGHTS>){
+ − 112 next if /^#/;
+ − 113 my ($name, $weight) = split;
+ − 114 $hmms{$name} = $weight;
+ − 115 }
+ − 116 }
+ − 117
+ − 118 # Parse HMM, lists names
+ − 119 open HMMS, '<', $hmm_file or die "Cannot open $hmm_file\n";
+ − 120 while (<HMMS>){
+ − 121 next unless /^NAME/;
+ − 122 my ($bla, $name) = split;
+ − 123 if ($weights_file) {
+ − 124 die "HMM $name doesn't have weight defined" unless $hmms{$name};
+ − 125 } else {
+ − 126 $hmms{$name}++;
+ − 127 }
+ − 128 }
+ − 129
+ − 130 # Get sum of weights
+ − 131 my $total;
+ − 132 foreach (values %hmms) {
+ − 133 $total += $_;
+ − 134 }
+ − 135 #print STDERR "Sum of weights: $total\n";
+ − 136
+ − 137 # Run hmmsearch
+ − 138 my $cmd = "hmmsearch --tblout $tab_file --cpu $nthreads ";
+ − 139 $cmd .= "--noali " unless $save_aln_file;
+ − 140 $cmd .= "-E $cutoff $hmm_file $fasta_in > $aln_file";
+ − 141 system($cmd) == 0 or die ("Command $cmd failed: $?");
+ − 142
+ − 143 # Read HMMER output. Assumes the first hit is always the best
+ − 144 open HMMER, '<', $tab_file or die $!;
+ − 145 my %seen;
+ − 146 my $sum;
+ − 147 while (<HMMER>){
+ − 148 next if /^#/;
+ − 149 my ($gid, $dash, $hname, $hid, $evalue) = split;
+ − 150 #my ($bla, $gi, $ref, $pid) = split(/\|/, $gid);
+ − 151 die ("No gid at $_\n") unless $gid;
+ − 152 die ("HMM not seen in the hmm file") unless $hmms{$hname};
+ − 153 # Skip if not first hit, or if evalue under cut-off
+ − 154 unless ($seen{$hname} || $evalue > $cutoff){
+ − 155 $sum += $hmms{$hname};
+ − 156 $seen{$hname} = $evalue;
+ − 157 }
+ − 158 }
+ − 159
+ − 160 # Save results if required
+ − 161 cp($tab_file, $save_tab_file) if ($save_tab_file);
+ − 162 cp($aln_file, $save_aln_file) if ($save_aln_file);
+ − 163
+ − 164 # Print results
+ − 165 printf "Completeness: %.4f", $sum/$total;
+ − 166 print "\nN markers found: ", scalar(keys %seen), " out of ",
+ − 167 scalar(keys %hmms), "\n";
+ − 168 if ($save_res_tab){
+ − 169 open TAB, '>', $save_res_tab or die "Could not open $save_res_tab: $?\n";
+ − 170 print TAB "#marker\t";
+ − 171 if ($display_eval) {
+ − 172 print TAB "evalue\n";
+ − 173 } else {
+ − 174 print TAB "present\n";
+ − 175 }
+ − 176 foreach my $hmm (sort keys %hmms){
+ − 177 print TAB "$hmm\t";
+ − 178 if ($seen{$hmm}){
+ − 179 if ($display_eval){
+ − 180 print TAB "$seen{$hmm}\n";
+ − 181 } else {
+ − 182 print TAB "1\n";
+ − 183 }
+ − 184 } else {
+ − 185 if ($display_eval){
+ − 186 print TAB "NA\n";
+ − 187 } else {
+ − 188 print TAB "0\n";
+ − 189 }
+ − 190 }
+ − 191 }
+ − 192 }
+ − 193