Mercurial > repos > melpetera > acorf
diff ACF/Analytic_correlation_filtration.pl @ 0:d03fcbeb0a77 draft
Uploaded
author | melpetera |
---|---|
date | Fri, 18 Oct 2019 04:59:51 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ACF/Analytic_correlation_filtration.pl Fri Oct 18 04:59:51 2019 -0400 @@ -0,0 +1,643 @@ +#!usr/bin/perl + +### Perl modules +use warnings; +use strict; +use Getopt::Long qw(GetOptions); #Creation of script options +use Pod::Usage qw(pod2usage); #Creation of script options + +#Personnal packages +use FindBin ; ## Allows you to locate the directory of original perl script +#use lib $FindBin::Bin; +use lib "$FindBin::Bin/lib"; +use IonFiltration; + +my ($file, $mass_file, $opt, $dataMatrix, $combined_DMVM, $repres_opt, $rt_threshold, $mass_threshold, $output_sif, $output_tabular, $correl_threshold, $intensity_threshold, $intensity_pourc); #Options to complete + +######################## +### Options and help ### +######################## + +GetOptions("f=s"=>\$file, "m=s"=>\$mass_file, "o=s"=>\$opt, "d=s"=>\$dataMatrix, "v=s"=>\$combined_DMVM, "r=s"=>\$repres_opt, "rt=f"=>\$rt_threshold, "mass=f"=>\$mass_threshold, "output_sif=s"=>\$output_sif, "output_tabular=s"=>\$output_tabular, "correl=s"=>\$correl_threshold, "IT=f"=>\$intensity_threshold, "IP=f"=>\$intensity_pourc) or pod2usage(2); + +### Check required parameters : +pod2usage({-message=>q{Mandatory argument '-f' is missing}, -exitval=>1, -verbose=>0}) unless $file; +#pod2usage({-message=>q{Mandatory argument '-m' is missing}, -exitval=>1, -verbose=>0}) unless $mass_file; +pod2usage({-message=>q{Mandatory argument '-o' is missing. It correspond to the grouping method for analytical correlation groups formation. +#It should be a number (1 ; 2 or 3) : +# 1 : Don't take into acount mass information (only RT) ; +# 2 : Check that all mass differences are include in a specific list and taking into acount RT information +# 3 : Check that all mass differences are include in a specific list, ignoring RT information +#To use the tool without takinf into account mass and RT information, use option 1 and define the RT threshold to 999999999.}, -exitval=>1, -verbose=>0}) unless $opt; +pod2usage({-message=>q{Mandatory argument '-r' is missing. It correspond to the group representent choosing method for analytical correlation groups formation. +It should be one of the 3 options below : + "mass" : choose the ion with the highest mass as the representant + "intensity" : choose the ion with the highest intensity as the representant + "mixt" : choose the ion with the highest (mass^2 * intensity) as the representant + "max_intensity_max_mass" : choose tha ion witht he highest intenisty among the 5 most intense ions of the group}, -exitval=>1, -verbose=>0}) unless $repres_opt; +pod2usage({-message=>q{Mandatory argument '-d' is missing}, -exitval=>1, -verbose=>0}) unless $dataMatrix; +pod2usage({-message=>q{Mandatory argument '-v' is missing}, -exitval=>1, -verbose=>0}) unless $combined_DMVM; +#pod2usage({-message=>q{Mandatory argument '-rt' is missing}, -exitval=>1, -verbose=>0}) unless $rt_threshold; +#pod2usage({-message=>q{Mandatory argument '-mass' is missing}, -exitval=>1, -verbose=>0}) unless $mass_threshold; +pod2usage({-message=>q{Mandatory argument '-correl' is missing}, -exitval=>1, -verbose=>0}) unless $correl_threshold; +pod2usage({-message=>q{Mandatory argument '-output_tabular' is missing}, -exitval=>1, -verbose=>0}) unless $output_tabular; +pod2usage({-message=>q{Mandatory argument '-output_sif' is missing}, -exitval=>1, -verbose=>0}) unless $output_sif; + + +#if(($opt != 1) && ($opt != 2) && ($opt != 3)){ +# print "you must indicate \"1\", \"2\" or \"3\" for the --o otpion\n"; +# exit; +#} + + + +if(($repres_opt ne "mass") && ($repres_opt ne "intensity") && ($repres_opt ne "mixt") && ($repres_opt ne "max_intensity_max_mass")){ + print "you must indicate \"mass\", \"intensity\", \"mix\" or \"max_intensity_max_mass\" for the --r otpion\n"; + exit; +} + + + +######################################################################### +#### Création of a hash containing all adduits and fragments possible ### +######################################################################### + +my %hmass; +if($opt != 1){ + %hmass = IonFiltration::MassCollecting($mass_file); + +} + +my $refhmass = \%hmass; + +print "Création of a hash containing all adduits and fragments possible\n"; + + +######################################################## +### Creation of a sif table + correlation filtration ### +######################################################## + +my %hrtmz; +($output_sif, %hrtmz) = IonFiltration::sifTableCreation($file, $output_sif, $opt, $rt_threshold, $mass_threshold, $correl_threshold, $dataMatrix, $output_tabular, $combined_DMVM, $repres_opt, $intensity_threshold, $intensity_pourc, \%hmass); +print "Creation of a sif table + correlation filtration done\n"; + + +###################################################### +### Analytic correlation filtrering follow options ### +###################################################### + +my %hheader_file; +my %hduplicate; + +my %hcorrelgroup; +my $groupct=1; + +my $linenb3=0; +my %hheader_line; + + + +open (F1, $output_sif) or die "Impossible to open $output_sif\n"; + +while(my $line = <F1>){ + my $count=0; + chomp $line; + my @tline = split(/\t/, $line); + my $a = $tline[0]; + my $b = $tline[2]; + + my $amass=$hrtmz{$a}{mz}; + my $atemp=$hrtmz{$a}{rt}; + my $bmass= $hrtmz{$b}{mz}; + my $btemp=$hrtmz{$b}{rt}; + my $diff = $amass-$bmass; + $diff = abs($diff); + + ### Option 1: Don't take into acount mass information ### + + if($opt == 1){ + my $btplus = $btemp + $rt_threshold; + my $btmoins = $btemp - $rt_threshold; + if(($btmoins <= $atemp) && ($atemp <= $btplus)){ + foreach my $k (keys %hcorrelgroup){ + if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){ + $hcorrelgroup{$k}{$a}=1; + $hcorrelgroup{$k}{$b}=1; + $count++; + last; + } + } + if($count == 0){ + my $groupnb="group".$groupct; + $hcorrelgroup{$groupnb}{$a}=1; + $hcorrelgroup{$groupnb}{$b}=1; + $groupct ++; + } + } + } + + + + ### Option 2: Check that all mass differences are include in a specific list taking into account RT information ### + + elsif($opt == 2){ + + my $print = 0; + foreach my $s (keys %{$refhmass}){ + foreach my $r (keys %{$refhmass->{$s}}){ + my $rm = $r - $mass_threshold; + my $rp = $r + $mass_threshold; + if(($diff <= $rp) && ($diff >= $rm)){ + if($print == 0){ + my $btplus = $btemp + $rt_threshold; + my $btmoins = $btemp - $rt_threshold; + + if(($btmoins <= $atemp) && ($atemp <= $btplus)){ + foreach my $k (keys %hcorrelgroup){ + if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){ + $hcorrelgroup{$k}{$a}=1; + $hcorrelgroup{$k}{$b}=1; + $count++; + last; + } + } + if($count == 0){ + my $groupnb="group".$groupct; + $hcorrelgroup{$groupnb}{$a}=1; + $hcorrelgroup{$groupnb}{$b}=1; + $groupct ++; + } + $print = 1; + } + } + } + } + } + } + + + ### Option 3: Check that all mass differences are include in a specific list, ignoring RT information ### + + elsif($opt == 3){ + + my $print = 0; + foreach my $s (keys %{$refhmass}){ + foreach my $r (keys %{$refhmass->{$s}}){ + my $rm = $r - $mass_threshold; + my $rp = $r + $mass_threshold; + if(($diff <= $rp) && ($diff >= $rm)){ + if($print == 0){ + + foreach my $k (keys %hcorrelgroup){ + if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){ + $hcorrelgroup{$k}{$a}=1; + $hcorrelgroup{$k}{$b}=1; + $count++; + last; + } + } + if($count == 0){ + my $groupnb="group".$groupct; + $hcorrelgroup{$groupnb}{$a}=1; + $hcorrelgroup{$groupnb}{$b}=1; + $groupct ++; + } + $print = 1; + } + } + } + } + } +} +close F1; + +print "Analytic correlation filtrering follow options done\n"; + + +############################################# +### Join groups that have been subdivided ### +############################################# + +my @tdelete; + +foreach my $k (keys %hcorrelgroup){ + foreach my $i (keys %{$hcorrelgroup{$k}}){ + foreach my $v (keys %hcorrelgroup){ + my $count = 0; + if ($v ne $k){ + foreach my $w (keys %{$hcorrelgroup{$v}}){ + if($w eq $i){ + $count = 1; + push(@tdelete, $v); + } + } + } + if($count == 1){ + foreach my $w (keys %{$hcorrelgroup{$v}}){ + $hcorrelgroup{$k}{$w}=$hcorrelgroup{$v}{$w}; + } + delete($hcorrelgroup{$v}); + } + } + } +} + +foreach my $t (@tdelete){ + delete($hcorrelgroup{$t}); +} + + +### Do it twice to see if it fix the problem of unmerge groups + +foreach my $k (keys %hcorrelgroup){ + foreach my $i (keys %{$hcorrelgroup{$k}}){ + foreach my $v (keys %hcorrelgroup){ + my $count = 0; + if ($v ne $k){ + foreach my $w (keys %{$hcorrelgroup{$v}}){ + if($w eq $i){ + $count = 1; + push(@tdelete, $v); + } + } + } + if($count == 1){ + foreach my $w (keys %{$hcorrelgroup{$v}}){ + $hcorrelgroup{$k}{$w}=$hcorrelgroup{$v}{$w}; + } + delete($hcorrelgroup{$v}); + } + } + } +} + +foreach my $t (@tdelete){ + delete($hcorrelgroup{$t}); +} + +print "Join groups that have been subdivided done\n"; + +####################################################### +### Addition of annotation information among groups ### +####################################################### + +foreach my $k (keys %hcorrelgroup){ + foreach my $i (keys %{$hcorrelgroup{$k}}){ + foreach my $j (keys %{$hcorrelgroup{$k}}){ + my $count = 0; + if ($i ne $j){ + + my $a = $hrtmz{$i}{mz}; + my $b = $hrtmz{$j}{mz}; + + my $diff = $a - $b; + my $sign; + if($diff>0){ + $sign="+"; + } + if($diff<0){ + $sign="-"; + } + $diff = abs($diff); + + foreach my $z (keys %{$refhmass}){ + + foreach my $y (keys %{$refhmass->{$z}}){ + my $ym = $y - $mass_threshold; + my $yp = $y + $mass_threshold; + + + if(($diff <= $yp) && ($diff >= $ym)){ + my $diff_list = $diff - $y; + $diff_list = abs($diff_list); + $diff_list = sprintf ("%0.6f", $diff_list); + + if($hcorrelgroup{$k}{$i} eq 1){ + my $val = "@".$j."|".$sign."(".$z.")(".$diff_list.")|"; + $hcorrelgroup{$k}{$i}=$val; + $count ++; + } + else{ + if($count == 0){ + my $val = "@".$j."|".$sign."(".$z.")(".$diff_list.")|"; + $hcorrelgroup{$k}{$i}.=$val; + $count ++; + } + else{ + my $val = $sign."(".$z.")(".$diff_list.")|"; + $hcorrelgroup{$k}{$i}.=$val; + $count ++; + } + } + } + } + } + } + } + } +} + + +print "Addition of annotation information among groups done\n"; + + +#################################################### +### Choose the representative ion for each group ### +#################################################### + +my %hgrouprepres; + +open(F3, $dataMatrix); + +while (my $line = <F3>){ + chomp $line; + + my @tline = split (/\t/, $line); + + foreach my $k (keys %hcorrelgroup){ + foreach my $i (keys %{$hcorrelgroup{$k}}){ + if($tline[0] eq $i){ + $hgrouprepres{$k}{$i}{mass}=$hrtmz{$tline[0]}{mz}; + my $intensity; + my $nbsubjects=0; + for(my $y=1;$y<scalar(@tline);$y++){ + $intensity += $tline[$y]; + $nbsubjects ++; + } + my $meanintensity = $intensity/$nbsubjects; + $hgrouprepres{$k}{$i}{intensity}=$meanintensity; + $hgrouprepres{$k}{$i}{squaredmassint}=($hgrouprepres{$k}{$i}{mass}**2)/($hgrouprepres{$k}{$i}{intensity}); + } + } + } +} +close F3; + +foreach my $z (keys %hgrouprepres){ + my $max_intensity = 0; + my $max_int_ion = ""; + my $max_mass = 0; + my $max_mass_ion = ""; + my $max_squared = 0; + my $max_squared_ion = ""; + foreach my $w (keys %{$hgrouprepres{$z}}){ + if($hgrouprepres{$z}{$w}{intensity} > $max_intensity){ + $max_intensity = $hgrouprepres{$z}{$w}{intensity}; + $max_int_ion = $w; + } + if($hgrouprepres{$z}{$w}{mass} > $max_mass){ + $max_mass = $hgrouprepres{$z}{$w}{mass}; + $max_mass_ion = $w; + } + if($hgrouprepres{$z}{$w}{squaredmassint} > $max_squared){ + $max_squared = $hgrouprepres{$z}{$w}{squaredmassint}; + $max_squared_ion = $w; + } + } + + my $max_int_max_mass_ion=""; + + if($repres_opt eq "max_intensity_max_mass"){ + my %hfirst; + my $first=0; + foreach my $w (reverse sort {$hgrouprepres{$z}{$a}{intensity} <=> $hgrouprepres{$z}{$b}{intensity} } keys %{$hgrouprepres{$z}}){ + $first ++; + if ($first <= 3){ + $hfirst{$w} = $hgrouprepres{$z}{$w}{intensity}; + } + } + + my $first_2 = 0; + my $intens_max = 0; + my $mass_max = 0; + + foreach my $y (reverse sort {$hfirst{$a} <=> $hfirst{$b}} keys %hfirst){ + + $first_2 ++; + if($first_2 == 1){ + $intens_max = $hfirst{$y}; + if($intensity_threshold > $intens_max){ + $intensity_threshold = 0; + } + $max_int_max_mass_ion = $y; + $mass_max = $hgrouprepres{$z}{$y}{mass}; + } + if($hgrouprepres{$z}{$y}{mass} > $mass_max){ + if($hfirst{$y}>$intensity_threshold){ + my $a = $intens_max * $intensity_pourc; + if($hfirst{$y} > $a){ + $max_int_max_mass_ion = $y; + $mass_max = $hgrouprepres{$z}{$y}{mass}; + } + } + } + } + } + + $hgrouprepres{$z}{max_int}=$max_int_ion; + $hgrouprepres{$z}{max_mass}=$max_mass_ion; + $hgrouprepres{$z}{max_squared}=$max_squared_ion; + $hgrouprepres{$z}{max_int_max_mass}=$max_int_max_mass_ion; + +} + + +print "Choose the representative ion for each group done\n"; + +############################################################################# +### Addition of annotation information relative to the representative ion ### +############################################################################# + +my %hreprescomparison; + +my $representative=""; + +if($opt != 1){ + foreach my $k (keys %hcorrelgroup){ + foreach my $i (keys %{$hcorrelgroup{$k}}){ + + if($repres_opt eq "mass"){$representative = $hgrouprepres{$k}{max_mass}} + if($repres_opt eq "intensity"){$representative = $hgrouprepres{$k}{max_int}} + if($repres_opt eq "mixt"){$representative = $hgrouprepres{$k}{max_squared}} + if($repres_opt eq "max_intensity_max_mass"){$representative = $hgrouprepres{$k}{max_int_max_mass}} + + + my $count = 0; + if ($i ne $representative){ + + my $a = $hrtmz{$i}{mz}; + my $b = $hrtmz{$representative}{mz}; + + my $diff = $a - $b; + my $sign; + if($diff>0){ + $sign="+"; + } + if($diff<0){ + $sign="-"; + } + $diff = abs($diff); + + foreach my $z (keys %{$refhmass}){ + + foreach my $y (keys %{$refhmass->{$z}}){ + my $ym = $y - $mass_threshold; + my $yp = $y + $mass_threshold; + + if(($diff <= $yp) && ($diff >= $ym)){ + my $diff_list = $diff - $y; + $diff_list = abs($diff_list); + $diff_list = sprintf ("%0.4f", $diff_list); + if($hcorrelgroup{$k}{$i} eq 1){ + my $valrep = "[M ".$sign."(".$z.")]|"; + $hreprescomparison{$k}{$i}{repres_diff}=$valrep; + $count ++; + } + else{ + if($count == 0){ + my $valrep = "[M ".$sign."(".$z.")]|"; + $hreprescomparison{$k}{$i}{repres_diff}.=$valrep; + $count ++; + } + else{ + my $valrep = "[M ".$sign."(".$z.")]|"; + $hreprescomparison{$k}{$i}{repres_diff}.=$valrep; + $count ++; + } + } + } + } + } + } + else{ + $hreprescomparison{$k}{$i}{repres_diff}="M"; + } + } + } +} + + +print "Addition of annotation information relative to the representative ion done\n"; + +############################## +### Print in result file ! ### +############################## + +open(F4, ">$output_tabular"); +open(F5, $combined_DMVM); + +my $line_nb = 0; +my %hheader; +while (my $line = <F5>){ + chomp $line; + + + my @tline = split (/\t/, $line); + + if($line_nb == 0){ + print F4 "$line\tACorF_groups"; + if($opt == 1){ + if($repres_opt eq "intensity"){print F4 "\tACorF_filter\tintensity_repres\n"} + if($repres_opt eq "mass"){print F4 "\tACorF_filter\tmass_repres\n"} + if($repres_opt eq "mixt"){print F4 "\tACorF_filter\tmass2intens_repres\n"} + if($repres_opt eq "max_intensity_max_mass"){print F4 "\tACorF_filter\tmax_intensity_max_mass_repres\n"} + } + else{ + if($repres_opt eq "intensity"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tintensity_repres\tannotation_relative_to_representative\n"} + if($repres_opt eq "mass"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tmass_repres\tannotation_relative_to_representative\n"} + if($repres_opt eq "mixt"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tmass2intens_repres\tannotation_relative_to_representative\n"} + if($repres_opt eq "max_intensity_max_mass"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tmax_intensity_max_mass_repres\tannotation_relative_to_representative\n"} + } + + + ### Creation of a header hash + for(my $i=0; $i<scalar(@tline);$i++){ + my $a = $tline[$i]; + $hheader{$a}=$i; + } + } + + else{ + my $find = 0; + foreach my $v (keys %hcorrelgroup){ + if(defined($hgrouprepres{$v}{$tline[0]})){ + print F4 "$line\t$v"; + + if($opt != 1){ + if(defined($hcorrelgroup{$v}{$tline[0]})){ + print F4 "\t$hcorrelgroup{$v}{$tline[0]}\t"; + + } + else{ + print F4 "\t"; + } + } + + if($repres_opt eq "intensity"){ + if($tline[0] eq $hgrouprepres{$v}{max_int}){ + print F4 "1\t"; + } + else{ + print F4 "0\t"; + } + $find = 1; + } + if($repres_opt eq "mass"){ + if($tline[0] eq $hgrouprepres{$v}{max_mass}){ + print F4 "1\t"; + } + else{ + print F4 "0\t"; + } + $find = 1; + } + if($repres_opt eq "mixt"){ + if($tline[0] eq $hgrouprepres{$v}{max_squared}){ + print F4 "1\t"; + } + else{ + print F4 "0\t"; + } + $find = 1; + } + if($repres_opt eq "max_intensity_max_mass"){ + if($tline[0] eq $hgrouprepres{$v}{max_int_max_mass}){ + print F4 "1\t"; + } + else{ + print F4 "0\t"; + } + $find = 1; + } + + if($repres_opt eq "intensity"){print F4 "$hgrouprepres{$v}{max_int}\t"} + if($repres_opt eq "mass"){print F4 "$hgrouprepres{$v}{max_mass}\t"} + if($repres_opt eq "mixt"){print F4 "$hgrouprepres{$v}{max_squared}\t"} + if($repres_opt eq "max_intensity_max_mass"){print F4 "$hgrouprepres{$v}{max_int_max_mass}\t"} + + if(defined($hreprescomparison{$v}{$tline[0]}{repres_diff})){ + print F4 "$hreprescomparison{$v}{$tline[0]}{repres_diff}\n"; + } + else{ + print F4 "-\n"; + } + } + } + if($find == 0){ + $groupct ++; + my $group = "group".$groupct; + if($opt != 1){ + print F4 "$line\t$group\t-\t-\t-\t-\n"; + } + else{ + print F4 "$line\t$group\t-\t-\n"; + } + } + } + $line_nb ++; +} + +print "Print in result file done\n"; + +print "All steps done\n"; +