Mercurial > repos > mir-bioinf > fix_excel_date_symbols
diff fix_excel_date_symbols/genes_not_dates_toolshed.pl @ 0:7271825cb6c8 draft default tip
Initial upload
author | mir-bioinf |
---|---|
date | Tue, 28 Apr 2015 12:30:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fix_excel_date_symbols/genes_not_dates_toolshed.pl Tue Apr 28 12:30:53 2015 -0400 @@ -0,0 +1,119 @@ +#!/usr/bin/perl + +use strict; +use warnings; +#use POSIX; + +use Getopt::Long; +use Pod::Usage; + +my $log=''; +my $data_in=''; +my $geneCols=''; +my $out_file=''; +my $spec=''; +my $lookupCol=''; + + +GetOptions( + "log=s" => \$log, + "expfile=s" => \$data_in, + "cols=s" => \$geneCols, ##want to specify columns otherwise if user wants to preserve actual dates anywhere they'll get replaced + "resultsfile=s" => \$out_file, + "species=s" => \$spec, + "lookup=s" => \$lookupCol, ##this could be empty +# "h|help" => \$help +) or pod2usage( -exitval => 2, -verbose => 2 ); + + +#check parameters and options +my $debug = scalar(@ARGV); +use IO::Handle; +open OUTPUT, '>',$log or die "cant open this file for OUTPUT: $log. Computer says: $!\n"; +open(my $results,'>',$out_file) or die "cannot open results file $out_file: $!\n"; +open(my $allexpr, "<", $data_in) or die "Cannot open input file $data_in: $!\n"; +my @Expression = <$allexpr>; +close($allexpr); + + +my @geneCols_ary = (split(',', $geneCols)); +my $numCols = scalar @geneCols_ary; + +if ($lookupCol) {print OUTPUT "User specified second identifier col for 1/2-Mar genes.\n\n";} ##DEBUG + +my $human_yes = 0; ##initialize human switch to 0 (default is mouse, otherwise need to convert symbol to uppercase) +if ($spec eq "human") { + $human_yes = 1; +} +my $current2ndLookup_noquotes; +my $current2ndLookup; +for (my $i=0; $i<scalar @Expression; $i++) { + my $tmp = scalar @Expression; + my @linetmp = split('\t', $Expression[$i]); + $linetmp[-1] = substr($linetmp[-1],0,-1); ##get rid of newline in last piece; will mess up matching + if ($lookupCol) { + ##NEED TO ACCOUNT FOR COMMA-DELIMITED LISTS + $current2ndLookup = $linetmp[$lookupCol-1]; ##This is 2nd gene identifier + $current2ndLookup =~ s/"//g; ##Remove quotes if they're there + my @stuff = split(',',$current2ndLookup); ##Need to consider comma-delim list (fairly common) + $current2ndLookup = $stuff[0]; ##First in list should be somewhere in lookup file + } + + for (my $j=0; $j<$numCols; $j++) { ##IF $LOOKUP THEN NUMCOLS WILL BE 1 AND ONLY ONE TIME THROUGH LOOP + my $currentGene = $linetmp[$geneCols_ary[$j]-1]; + $currentGene =~ s/"//g; ##Might have quotes here too + + my $match = qx(cat ./genesymbol_dateLUT.tab | awk '\$1 == "$currentGene"'); ##10-8-14 change + my $debugL = length $match; + my @matchAry; + if ($debugL>0) { ##FOUND IN THE FIRST LIST + @matchAry = split('\t',$match); + $match =~ s/ //g; ##Try to fix the ^Ms at ends of lines + } else { ##CHECK IF THEY'RE 1-MAR OR 2-MAR: + if ($lookupCol) { + if ($human_yes == 1) { + $match = qx(cat ./Mar1_2_LUT_human.txt | awk '\$1 == "$currentGene"' | awk '\$2 == "$current2ndLookup"'); + } else { + $match = qx(cat ./Mar1_2_LUT_mouse.txt | awk '\$1 == "$currentGene"' | awk '\$2 == "$current2ndLookup"'); + } + @matchAry = split('\t',$match); + } + } + $debugL = length $match; + if ($debugL > 0) { + my $blah; + if ($human_yes == 1) { ##Replace date with gene symbol (2nd col in file) + $blah = uc substr($matchAry[-1],0,-1); ##SHOULD BE ALWAYS LAST THING IN THE ROW + $blah =~ s/ //g; + $linetmp[$geneCols_ary[$j]-1] = $blah; ##SHOULD BE ALWAYS LAST THING IN THE ROW + print OUTPUT "Match found for $currentGene, replacing with $linetmp[$geneCols_ary[$j]-1]\n"; + } else { + $blah = substr($matchAry[-1],0,-1); + $blah =~ s/ //g; + $linetmp[$geneCols_ary[$j]-1] = $blah; + print OUTPUT "Match found for $currentGene, replacing with $linetmp[$geneCols_ary[$j]-1]\n"; + } + + } else { + ##GIVE SOME OUTPUT TO HINT USER IF GENES ARE 1/2-MAR (REGARDLESS OF WHAT WAS CHOSEN). THIS WILL SLOW CODE DOWN THOUGH... + my $match_h = qx(cat ./Mar1_2_LUT_human.txt | awk '\$1 == "$currentGene"'); + my $match_m = qx(cat ./Mar1_2_LUT_mouse.txt | awk '\$1 == "$currentGene"'); + my $debugL_h = length $match_h; + my $debugL_m = length $match_m; + if ( ($debugL_h>0) || ($debugL_m>0) ) { ##We have a 1/2-Mar gene but can't fix it + print OUTPUT "In file is $currentGene. Cannot replace because "; + if ($lookupCol) { + print OUTPUT "second identifier, $current2ndLookup, is not in reference file.\n"; + } else { + print OUTPUT "no second identifier column was specified.\n"; + } + } + } + + + } + print $results join("\t",@linetmp),"\n"; +} + +close $results; +close OUTPUT;