comparison fix_excel_date_symbols/genes_not_dates_toolshed.pl @ 0:7271825cb6c8 draft default tip

Initial upload
author mir-bioinf
date Tue, 28 Apr 2015 12:30:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:7271825cb6c8
1 #!/usr/bin/perl
2
3 use strict;
4 use warnings;
5 #use POSIX;
6
7 use Getopt::Long;
8 use Pod::Usage;
9
10 my $log='';
11 my $data_in='';
12 my $geneCols='';
13 my $out_file='';
14 my $spec='';
15 my $lookupCol='';
16
17
18 GetOptions(
19 "log=s" => \$log,
20 "expfile=s" => \$data_in,
21 "cols=s" => \$geneCols, ##want to specify columns otherwise if user wants to preserve actual dates anywhere they'll get replaced
22 "resultsfile=s" => \$out_file,
23 "species=s" => \$spec,
24 "lookup=s" => \$lookupCol, ##this could be empty
25 # "h|help" => \$help
26 ) or pod2usage( -exitval => 2, -verbose => 2 );
27
28
29 #check parameters and options
30 my $debug = scalar(@ARGV);
31 use IO::Handle;
32 open OUTPUT, '>',$log or die "cant open this file for OUTPUT: $log. Computer says: $!\n";
33 open(my $results,'>',$out_file) or die "cannot open results file $out_file: $!\n";
34 open(my $allexpr, "<", $data_in) or die "Cannot open input file $data_in: $!\n";
35 my @Expression = <$allexpr>;
36 close($allexpr);
37
38
39 my @geneCols_ary = (split(',', $geneCols));
40 my $numCols = scalar @geneCols_ary;
41
42 if ($lookupCol) {print OUTPUT "User specified second identifier col for 1/2-Mar genes.\n\n";} ##DEBUG
43
44 my $human_yes = 0; ##initialize human switch to 0 (default is mouse, otherwise need to convert symbol to uppercase)
45 if ($spec eq "human") {
46 $human_yes = 1;
47 }
48 my $current2ndLookup_noquotes;
49 my $current2ndLookup;
50 for (my $i=0; $i<scalar @Expression; $i++) {
51 my $tmp = scalar @Expression;
52 my @linetmp = split('\t', $Expression[$i]);
53 $linetmp[-1] = substr($linetmp[-1],0,-1); ##get rid of newline in last piece; will mess up matching
54 if ($lookupCol) {
55 ##NEED TO ACCOUNT FOR COMMA-DELIMITED LISTS
56 $current2ndLookup = $linetmp[$lookupCol-1]; ##This is 2nd gene identifier
57 $current2ndLookup =~ s/"//g; ##Remove quotes if they're there
58 my @stuff = split(',',$current2ndLookup); ##Need to consider comma-delim list (fairly common)
59 $current2ndLookup = $stuff[0]; ##First in list should be somewhere in lookup file
60 }
61
62 for (my $j=0; $j<$numCols; $j++) { ##IF $LOOKUP THEN NUMCOLS WILL BE 1 AND ONLY ONE TIME THROUGH LOOP
63 my $currentGene = $linetmp[$geneCols_ary[$j]-1];
64 $currentGene =~ s/"//g; ##Might have quotes here too
65
66 my $match = qx(cat ./genesymbol_dateLUT.tab | awk '\$1 == "$currentGene"'); ##10-8-14 change
67 my $debugL = length $match;
68 my @matchAry;
69 if ($debugL>0) { ##FOUND IN THE FIRST LIST
70 @matchAry = split('\t',$match);
71 $match =~ s/
72 //g; ##Try to fix the ^Ms at ends of lines
73 } else { ##CHECK IF THEY'RE 1-MAR OR 2-MAR:
74 if ($lookupCol) {
75 if ($human_yes == 1) {
76 $match = qx(cat ./Mar1_2_LUT_human.txt | awk '\$1 == "$currentGene"' | awk '\$2 == "$current2ndLookup"');
77 } else {
78 $match = qx(cat ./Mar1_2_LUT_mouse.txt | awk '\$1 == "$currentGene"' | awk '\$2 == "$current2ndLookup"');
79 }
80 @matchAry = split('\t',$match);
81 }
82 }
83 $debugL = length $match;
84 if ($debugL > 0) {
85 my $blah;
86 if ($human_yes == 1) { ##Replace date with gene symbol (2nd col in file)
87 $blah = uc substr($matchAry[-1],0,-1); ##SHOULD BE ALWAYS LAST THING IN THE ROW
88 $blah =~ s/
89 //g;
90 $linetmp[$geneCols_ary[$j]-1] = $blah; ##SHOULD BE ALWAYS LAST THING IN THE ROW
91 print OUTPUT "Match found for $currentGene, replacing with $linetmp[$geneCols_ary[$j]-1]\n";
92 } else {
93 $blah = substr($matchAry[-1],0,-1);
94 $blah =~ s/
95 //g;
96 $linetmp[$geneCols_ary[$j]-1] = $blah;
97 print OUTPUT "Match found for $currentGene, replacing with $linetmp[$geneCols_ary[$j]-1]\n";
98 }
99
100 } else {
101 ##GIVE SOME OUTPUT TO HINT USER IF GENES ARE 1/2-MAR (REGARDLESS OF WHAT WAS CHOSEN). THIS WILL SLOW CODE DOWN THOUGH...
102 my $match_h = qx(cat ./Mar1_2_LUT_human.txt | awk '\$1 == "$currentGene"');
103 my $match_m = qx(cat ./Mar1_2_LUT_mouse.txt | awk '\$1 == "$currentGene"');
104 my $debugL_h = length $match_h;
105 my $debugL_m = length $match_m;
106 if ( ($debugL_h>0) || ($debugL_m>0) ) { ##We have a 1/2-Mar gene but can't fix it
107 print OUTPUT "In file is $currentGene. Cannot replace because ";
108 if ($lookupCol) {
109 print OUTPUT "second identifier, $current2ndLookup, is not in reference file.\n";
110 } else {
111 print OUTPUT "no second identifier column was specified.\n";
112 }
113 }
114 }
115
116
117 }
118 print $results join("\t",@linetmp),"\n";
119 }
120
121 close $results;
122 close OUTPUT;