annotate fix_excel_date_symbols/genes_not_dates_toolshed.pl @ 0:7271825cb6c8 draft default tip

Initial upload
author mir-bioinf
date Tue, 28 Apr 2015 12:30:53 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
1 #!/usr/bin/perl
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
2
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
3 use strict;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
4 use warnings;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
5 #use POSIX;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
6
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
7 use Getopt::Long;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
8 use Pod::Usage;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
9
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
10 my $log='';
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
11 my $data_in='';
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
12 my $geneCols='';
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
13 my $out_file='';
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
14 my $spec='';
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
15 my $lookupCol='';
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
16
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
17
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
18 GetOptions(
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
19 "log=s" => \$log,
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
20 "expfile=s" => \$data_in,
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
21 "cols=s" => \$geneCols, ##want to specify columns otherwise if user wants to preserve actual dates anywhere they'll get replaced
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
22 "resultsfile=s" => \$out_file,
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
23 "species=s" => \$spec,
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
24 "lookup=s" => \$lookupCol, ##this could be empty
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
25 # "h|help" => \$help
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
26 ) or pod2usage( -exitval => 2, -verbose => 2 );
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
27
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
28
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
29 #check parameters and options
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
30 my $debug = scalar(@ARGV);
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
31 use IO::Handle;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
32 open OUTPUT, '>',$log or die "cant open this file for OUTPUT: $log. Computer says: $!\n";
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
33 open(my $results,'>',$out_file) or die "cannot open results file $out_file: $!\n";
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
34 open(my $allexpr, "<", $data_in) or die "Cannot open input file $data_in: $!\n";
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
35 my @Expression = <$allexpr>;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
36 close($allexpr);
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
37
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
38
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
39 my @geneCols_ary = (split(',', $geneCols));
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
40 my $numCols = scalar @geneCols_ary;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
41
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
42 if ($lookupCol) {print OUTPUT "User specified second identifier col for 1/2-Mar genes.\n\n";} ##DEBUG
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
43
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
44 my $human_yes = 0; ##initialize human switch to 0 (default is mouse, otherwise need to convert symbol to uppercase)
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
45 if ($spec eq "human") {
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
46 $human_yes = 1;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
47 }
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
48 my $current2ndLookup_noquotes;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
49 my $current2ndLookup;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
50 for (my $i=0; $i<scalar @Expression; $i++) {
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
51 my $tmp = scalar @Expression;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
52 my @linetmp = split('\t', $Expression[$i]);
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
53 $linetmp[-1] = substr($linetmp[-1],0,-1); ##get rid of newline in last piece; will mess up matching
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
54 if ($lookupCol) {
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
55 ##NEED TO ACCOUNT FOR COMMA-DELIMITED LISTS
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
56 $current2ndLookup = $linetmp[$lookupCol-1]; ##This is 2nd gene identifier
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
57 $current2ndLookup =~ s/"//g; ##Remove quotes if they're there
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
58 my @stuff = split(',',$current2ndLookup); ##Need to consider comma-delim list (fairly common)
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
59 $current2ndLookup = $stuff[0]; ##First in list should be somewhere in lookup file
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
60 }
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
61
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
62 for (my $j=0; $j<$numCols; $j++) { ##IF $LOOKUP THEN NUMCOLS WILL BE 1 AND ONLY ONE TIME THROUGH LOOP
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
63 my $currentGene = $linetmp[$geneCols_ary[$j]-1];
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
64 $currentGene =~ s/"//g; ##Might have quotes here too
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
65
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
66 my $match = qx(cat ./genesymbol_dateLUT.tab | awk '\$1 == "$currentGene"'); ##10-8-14 change
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
67 my $debugL = length $match;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
68 my @matchAry;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
69 if ($debugL>0) { ##FOUND IN THE FIRST LIST
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
70 @matchAry = split('\t',$match);
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
71 $match =~ s/ //g; ##Try to fix the ^Ms at ends of lines
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
72 } else { ##CHECK IF THEY'RE 1-MAR OR 2-MAR:
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
73 if ($lookupCol) {
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
74 if ($human_yes == 1) {
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
75 $match = qx(cat ./Mar1_2_LUT_human.txt | awk '\$1 == "$currentGene"' | awk '\$2 == "$current2ndLookup"');
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
76 } else {
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
77 $match = qx(cat ./Mar1_2_LUT_mouse.txt | awk '\$1 == "$currentGene"' | awk '\$2 == "$current2ndLookup"');
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
78 }
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
79 @matchAry = split('\t',$match);
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
80 }
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
81 }
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
82 $debugL = length $match;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
83 if ($debugL > 0) {
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
84 my $blah;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
85 if ($human_yes == 1) { ##Replace date with gene symbol (2nd col in file)
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
86 $blah = uc substr($matchAry[-1],0,-1); ##SHOULD BE ALWAYS LAST THING IN THE ROW
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
87 $blah =~ s/ //g;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
88 $linetmp[$geneCols_ary[$j]-1] = $blah; ##SHOULD BE ALWAYS LAST THING IN THE ROW
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
89 print OUTPUT "Match found for $currentGene, replacing with $linetmp[$geneCols_ary[$j]-1]\n";
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
90 } else {
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
91 $blah = substr($matchAry[-1],0,-1);
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
92 $blah =~ s/ //g;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
93 $linetmp[$geneCols_ary[$j]-1] = $blah;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
94 print OUTPUT "Match found for $currentGene, replacing with $linetmp[$geneCols_ary[$j]-1]\n";
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
95 }
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
96
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
97 } else {
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
98 ##GIVE SOME OUTPUT TO HINT USER IF GENES ARE 1/2-MAR (REGARDLESS OF WHAT WAS CHOSEN). THIS WILL SLOW CODE DOWN THOUGH...
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
99 my $match_h = qx(cat ./Mar1_2_LUT_human.txt | awk '\$1 == "$currentGene"');
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
100 my $match_m = qx(cat ./Mar1_2_LUT_mouse.txt | awk '\$1 == "$currentGene"');
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
101 my $debugL_h = length $match_h;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
102 my $debugL_m = length $match_m;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
103 if ( ($debugL_h>0) || ($debugL_m>0) ) { ##We have a 1/2-Mar gene but can't fix it
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
104 print OUTPUT "In file is $currentGene. Cannot replace because ";
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
105 if ($lookupCol) {
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
106 print OUTPUT "second identifier, $current2ndLookup, is not in reference file.\n";
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
107 } else {
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
108 print OUTPUT "no second identifier column was specified.\n";
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
109 }
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
110 }
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
111 }
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
112
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
113
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
114 }
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
115 print $results join("\t",@linetmp),"\n";
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
116 }
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
117
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
118 close $results;
7271825cb6c8 Initial upload
mir-bioinf
parents:
diff changeset
119 close OUTPUT;