0
|
1 #!/usr/bin/perl
|
|
2
|
|
3 use strict;
|
|
4 use warnings;
|
|
5 #use POSIX;
|
|
6
|
|
7 use Getopt::Long;
|
|
8 use Pod::Usage;
|
|
9
|
|
10 my $log='';
|
|
11 my $data_in='';
|
|
12 my $geneCols='';
|
|
13 my $out_file='';
|
|
14 my $spec='';
|
|
15 my $lookupCol='';
|
|
16
|
|
17
|
|
18 GetOptions(
|
|
19 "log=s" => \$log,
|
|
20 "expfile=s" => \$data_in,
|
|
21 "cols=s" => \$geneCols, ##want to specify columns otherwise if user wants to preserve actual dates anywhere they'll get replaced
|
|
22 "resultsfile=s" => \$out_file,
|
|
23 "species=s" => \$spec,
|
|
24 "lookup=s" => \$lookupCol, ##this could be empty
|
|
25 # "h|help" => \$help
|
|
26 ) or pod2usage( -exitval => 2, -verbose => 2 );
|
|
27
|
|
28
|
|
29 #check parameters and options
|
|
30 my $debug = scalar(@ARGV);
|
|
31 use IO::Handle;
|
|
32 open OUTPUT, '>',$log or die "cant open this file for OUTPUT: $log. Computer says: $!\n";
|
|
33 open(my $results,'>',$out_file) or die "cannot open results file $out_file: $!\n";
|
|
34 open(my $allexpr, "<", $data_in) or die "Cannot open input file $data_in: $!\n";
|
|
35 my @Expression = <$allexpr>;
|
|
36 close($allexpr);
|
|
37
|
|
38
|
|
39 my @geneCols_ary = (split(',', $geneCols));
|
|
40 my $numCols = scalar @geneCols_ary;
|
|
41
|
|
42 if ($lookupCol) {print OUTPUT "User specified second identifier col for 1/2-Mar genes.\n\n";} ##DEBUG
|
|
43
|
|
44 my $human_yes = 0; ##initialize human switch to 0 (default is mouse, otherwise need to convert symbol to uppercase)
|
|
45 if ($spec eq "human") {
|
|
46 $human_yes = 1;
|
|
47 }
|
|
48 my $current2ndLookup_noquotes;
|
|
49 my $current2ndLookup;
|
|
50 for (my $i=0; $i<scalar @Expression; $i++) {
|
|
51 my $tmp = scalar @Expression;
|
|
52 my @linetmp = split('\t', $Expression[$i]);
|
|
53 $linetmp[-1] = substr($linetmp[-1],0,-1); ##get rid of newline in last piece; will mess up matching
|
|
54 if ($lookupCol) {
|
|
55 ##NEED TO ACCOUNT FOR COMMA-DELIMITED LISTS
|
|
56 $current2ndLookup = $linetmp[$lookupCol-1]; ##This is 2nd gene identifier
|
|
57 $current2ndLookup =~ s/"//g; ##Remove quotes if they're there
|
|
58 my @stuff = split(',',$current2ndLookup); ##Need to consider comma-delim list (fairly common)
|
|
59 $current2ndLookup = $stuff[0]; ##First in list should be somewhere in lookup file
|
|
60 }
|
|
61
|
|
62 for (my $j=0; $j<$numCols; $j++) { ##IF $LOOKUP THEN NUMCOLS WILL BE 1 AND ONLY ONE TIME THROUGH LOOP
|
|
63 my $currentGene = $linetmp[$geneCols_ary[$j]-1];
|
|
64 $currentGene =~ s/"//g; ##Might have quotes here too
|
|
65
|
|
66 my $match = qx(cat ./genesymbol_dateLUT.tab | awk '\$1 == "$currentGene"'); ##10-8-14 change
|
|
67 my $debugL = length $match;
|
|
68 my @matchAry;
|
|
69 if ($debugL>0) { ##FOUND IN THE FIRST LIST
|
|
70 @matchAry = split('\t',$match);
|
|
71 $match =~ s/
//g; ##Try to fix the ^Ms at ends of lines
|
|
72 } else { ##CHECK IF THEY'RE 1-MAR OR 2-MAR:
|
|
73 if ($lookupCol) {
|
|
74 if ($human_yes == 1) {
|
|
75 $match = qx(cat ./Mar1_2_LUT_human.txt | awk '\$1 == "$currentGene"' | awk '\$2 == "$current2ndLookup"');
|
|
76 } else {
|
|
77 $match = qx(cat ./Mar1_2_LUT_mouse.txt | awk '\$1 == "$currentGene"' | awk '\$2 == "$current2ndLookup"');
|
|
78 }
|
|
79 @matchAry = split('\t',$match);
|
|
80 }
|
|
81 }
|
|
82 $debugL = length $match;
|
|
83 if ($debugL > 0) {
|
|
84 my $blah;
|
|
85 if ($human_yes == 1) { ##Replace date with gene symbol (2nd col in file)
|
|
86 $blah = uc substr($matchAry[-1],0,-1); ##SHOULD BE ALWAYS LAST THING IN THE ROW
|
|
87 $blah =~ s/
//g;
|
|
88 $linetmp[$geneCols_ary[$j]-1] = $blah; ##SHOULD BE ALWAYS LAST THING IN THE ROW
|
|
89 print OUTPUT "Match found for $currentGene, replacing with $linetmp[$geneCols_ary[$j]-1]\n";
|
|
90 } else {
|
|
91 $blah = substr($matchAry[-1],0,-1);
|
|
92 $blah =~ s/
//g;
|
|
93 $linetmp[$geneCols_ary[$j]-1] = $blah;
|
|
94 print OUTPUT "Match found for $currentGene, replacing with $linetmp[$geneCols_ary[$j]-1]\n";
|
|
95 }
|
|
96
|
|
97 } else {
|
|
98 ##GIVE SOME OUTPUT TO HINT USER IF GENES ARE 1/2-MAR (REGARDLESS OF WHAT WAS CHOSEN). THIS WILL SLOW CODE DOWN THOUGH...
|
|
99 my $match_h = qx(cat ./Mar1_2_LUT_human.txt | awk '\$1 == "$currentGene"');
|
|
100 my $match_m = qx(cat ./Mar1_2_LUT_mouse.txt | awk '\$1 == "$currentGene"');
|
|
101 my $debugL_h = length $match_h;
|
|
102 my $debugL_m = length $match_m;
|
|
103 if ( ($debugL_h>0) || ($debugL_m>0) ) { ##We have a 1/2-Mar gene but can't fix it
|
|
104 print OUTPUT "In file is $currentGene. Cannot replace because ";
|
|
105 if ($lookupCol) {
|
|
106 print OUTPUT "second identifier, $current2ndLookup, is not in reference file.\n";
|
|
107 } else {
|
|
108 print OUTPUT "no second identifier column was specified.\n";
|
|
109 }
|
|
110 }
|
|
111 }
|
|
112
|
|
113
|
|
114 }
|
|
115 print $results join("\t",@linetmp),"\n";
|
|
116 }
|
|
117
|
|
118 close $results;
|
|
119 close OUTPUT;
|