Mercurial > repos > mir-bioinf > fix_excel_date_symbols
view fix_excel_date_symbols/genes_not_dates_toolshed.pl @ 0:7271825cb6c8 draft default tip
Initial upload
author | mir-bioinf |
---|---|
date | Tue, 28 Apr 2015 12:30:53 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl use strict; use warnings; #use POSIX; use Getopt::Long; use Pod::Usage; my $log=''; my $data_in=''; my $geneCols=''; my $out_file=''; my $spec=''; my $lookupCol=''; GetOptions( "log=s" => \$log, "expfile=s" => \$data_in, "cols=s" => \$geneCols, ##want to specify columns otherwise if user wants to preserve actual dates anywhere they'll get replaced "resultsfile=s" => \$out_file, "species=s" => \$spec, "lookup=s" => \$lookupCol, ##this could be empty # "h|help" => \$help ) or pod2usage( -exitval => 2, -verbose => 2 ); #check parameters and options my $debug = scalar(@ARGV); use IO::Handle; open OUTPUT, '>',$log or die "cant open this file for OUTPUT: $log. Computer says: $!\n"; open(my $results,'>',$out_file) or die "cannot open results file $out_file: $!\n"; open(my $allexpr, "<", $data_in) or die "Cannot open input file $data_in: $!\n"; my @Expression = <$allexpr>; close($allexpr); my @geneCols_ary = (split(',', $geneCols)); my $numCols = scalar @geneCols_ary; if ($lookupCol) {print OUTPUT "User specified second identifier col for 1/2-Mar genes.\n\n";} ##DEBUG my $human_yes = 0; ##initialize human switch to 0 (default is mouse, otherwise need to convert symbol to uppercase) if ($spec eq "human") { $human_yes = 1; } my $current2ndLookup_noquotes; my $current2ndLookup; for (my $i=0; $i<scalar @Expression; $i++) { my $tmp = scalar @Expression; my @linetmp = split('\t', $Expression[$i]); $linetmp[-1] = substr($linetmp[-1],0,-1); ##get rid of newline in last piece; will mess up matching if ($lookupCol) { ##NEED TO ACCOUNT FOR COMMA-DELIMITED LISTS $current2ndLookup = $linetmp[$lookupCol-1]; ##This is 2nd gene identifier $current2ndLookup =~ s/"//g; ##Remove quotes if they're there my @stuff = split(',',$current2ndLookup); ##Need to consider comma-delim list (fairly common) $current2ndLookup = $stuff[0]; ##First in list should be somewhere in lookup file } for (my $j=0; $j<$numCols; $j++) { ##IF $LOOKUP THEN NUMCOLS WILL BE 1 AND ONLY ONE TIME THROUGH LOOP my $currentGene = $linetmp[$geneCols_ary[$j]-1]; $currentGene =~ s/"//g; ##Might have quotes here too my $match = qx(cat ./genesymbol_dateLUT.tab | awk '\$1 == "$currentGene"'); ##10-8-14 change my $debugL = length $match; my @matchAry; if ($debugL>0) { ##FOUND IN THE FIRST LIST @matchAry = split('\t',$match); $match =~ s/ //g; ##Try to fix the ^Ms at ends of lines } else { ##CHECK IF THEY'RE 1-MAR OR 2-MAR: if ($lookupCol) { if ($human_yes == 1) { $match = qx(cat ./Mar1_2_LUT_human.txt | awk '\$1 == "$currentGene"' | awk '\$2 == "$current2ndLookup"'); } else { $match = qx(cat ./Mar1_2_LUT_mouse.txt | awk '\$1 == "$currentGene"' | awk '\$2 == "$current2ndLookup"'); } @matchAry = split('\t',$match); } } $debugL = length $match; if ($debugL > 0) { my $blah; if ($human_yes == 1) { ##Replace date with gene symbol (2nd col in file) $blah = uc substr($matchAry[-1],0,-1); ##SHOULD BE ALWAYS LAST THING IN THE ROW $blah =~ s/ //g; $linetmp[$geneCols_ary[$j]-1] = $blah; ##SHOULD BE ALWAYS LAST THING IN THE ROW print OUTPUT "Match found for $currentGene, replacing with $linetmp[$geneCols_ary[$j]-1]\n"; } else { $blah = substr($matchAry[-1],0,-1); $blah =~ s/ //g; $linetmp[$geneCols_ary[$j]-1] = $blah; print OUTPUT "Match found for $currentGene, replacing with $linetmp[$geneCols_ary[$j]-1]\n"; } } else { ##GIVE SOME OUTPUT TO HINT USER IF GENES ARE 1/2-MAR (REGARDLESS OF WHAT WAS CHOSEN). THIS WILL SLOW CODE DOWN THOUGH... my $match_h = qx(cat ./Mar1_2_LUT_human.txt | awk '\$1 == "$currentGene"'); my $match_m = qx(cat ./Mar1_2_LUT_mouse.txt | awk '\$1 == "$currentGene"'); my $debugL_h = length $match_h; my $debugL_m = length $match_m; if ( ($debugL_h>0) || ($debugL_m>0) ) { ##We have a 1/2-Mar gene but can't fix it print OUTPUT "In file is $currentGene. Cannot replace because "; if ($lookupCol) { print OUTPUT "second identifier, $current2ndLookup, is not in reference file.\n"; } else { print OUTPUT "no second identifier column was specified.\n"; } } } } print $results join("\t",@linetmp),"\n"; } close $results; close OUTPUT;