Mercurial > repos > mir-bioinf > fix_excel_date_symbols
comparison fix_excel_date_symbols/genes_not_dates_toolshed.pl @ 0:7271825cb6c8 draft default tip
Initial upload
author | mir-bioinf |
---|---|
date | Tue, 28 Apr 2015 12:30:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:7271825cb6c8 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 use strict; | |
4 use warnings; | |
5 #use POSIX; | |
6 | |
7 use Getopt::Long; | |
8 use Pod::Usage; | |
9 | |
10 my $log=''; | |
11 my $data_in=''; | |
12 my $geneCols=''; | |
13 my $out_file=''; | |
14 my $spec=''; | |
15 my $lookupCol=''; | |
16 | |
17 | |
18 GetOptions( | |
19 "log=s" => \$log, | |
20 "expfile=s" => \$data_in, | |
21 "cols=s" => \$geneCols, ##want to specify columns otherwise if user wants to preserve actual dates anywhere they'll get replaced | |
22 "resultsfile=s" => \$out_file, | |
23 "species=s" => \$spec, | |
24 "lookup=s" => \$lookupCol, ##this could be empty | |
25 # "h|help" => \$help | |
26 ) or pod2usage( -exitval => 2, -verbose => 2 ); | |
27 | |
28 | |
29 #check parameters and options | |
30 my $debug = scalar(@ARGV); | |
31 use IO::Handle; | |
32 open OUTPUT, '>',$log or die "cant open this file for OUTPUT: $log. Computer says: $!\n"; | |
33 open(my $results,'>',$out_file) or die "cannot open results file $out_file: $!\n"; | |
34 open(my $allexpr, "<", $data_in) or die "Cannot open input file $data_in: $!\n"; | |
35 my @Expression = <$allexpr>; | |
36 close($allexpr); | |
37 | |
38 | |
39 my @geneCols_ary = (split(',', $geneCols)); | |
40 my $numCols = scalar @geneCols_ary; | |
41 | |
42 if ($lookupCol) {print OUTPUT "User specified second identifier col for 1/2-Mar genes.\n\n";} ##DEBUG | |
43 | |
44 my $human_yes = 0; ##initialize human switch to 0 (default is mouse, otherwise need to convert symbol to uppercase) | |
45 if ($spec eq "human") { | |
46 $human_yes = 1; | |
47 } | |
48 my $current2ndLookup_noquotes; | |
49 my $current2ndLookup; | |
50 for (my $i=0; $i<scalar @Expression; $i++) { | |
51 my $tmp = scalar @Expression; | |
52 my @linetmp = split('\t', $Expression[$i]); | |
53 $linetmp[-1] = substr($linetmp[-1],0,-1); ##get rid of newline in last piece; will mess up matching | |
54 if ($lookupCol) { | |
55 ##NEED TO ACCOUNT FOR COMMA-DELIMITED LISTS | |
56 $current2ndLookup = $linetmp[$lookupCol-1]; ##This is 2nd gene identifier | |
57 $current2ndLookup =~ s/"//g; ##Remove quotes if they're there | |
58 my @stuff = split(',',$current2ndLookup); ##Need to consider comma-delim list (fairly common) | |
59 $current2ndLookup = $stuff[0]; ##First in list should be somewhere in lookup file | |
60 } | |
61 | |
62 for (my $j=0; $j<$numCols; $j++) { ##IF $LOOKUP THEN NUMCOLS WILL BE 1 AND ONLY ONE TIME THROUGH LOOP | |
63 my $currentGene = $linetmp[$geneCols_ary[$j]-1]; | |
64 $currentGene =~ s/"//g; ##Might have quotes here too | |
65 | |
66 my $match = qx(cat ./genesymbol_dateLUT.tab | awk '\$1 == "$currentGene"'); ##10-8-14 change | |
67 my $debugL = length $match; | |
68 my @matchAry; | |
69 if ($debugL>0) { ##FOUND IN THE FIRST LIST | |
70 @matchAry = split('\t',$match); | |
71 $match =~ s/ | |
72 //g; ##Try to fix the ^Ms at ends of lines | |
73 } else { ##CHECK IF THEY'RE 1-MAR OR 2-MAR: | |
74 if ($lookupCol) { | |
75 if ($human_yes == 1) { | |
76 $match = qx(cat ./Mar1_2_LUT_human.txt | awk '\$1 == "$currentGene"' | awk '\$2 == "$current2ndLookup"'); | |
77 } else { | |
78 $match = qx(cat ./Mar1_2_LUT_mouse.txt | awk '\$1 == "$currentGene"' | awk '\$2 == "$current2ndLookup"'); | |
79 } | |
80 @matchAry = split('\t',$match); | |
81 } | |
82 } | |
83 $debugL = length $match; | |
84 if ($debugL > 0) { | |
85 my $blah; | |
86 if ($human_yes == 1) { ##Replace date with gene symbol (2nd col in file) | |
87 $blah = uc substr($matchAry[-1],0,-1); ##SHOULD BE ALWAYS LAST THING IN THE ROW | |
88 $blah =~ s/ | |
89 //g; | |
90 $linetmp[$geneCols_ary[$j]-1] = $blah; ##SHOULD BE ALWAYS LAST THING IN THE ROW | |
91 print OUTPUT "Match found for $currentGene, replacing with $linetmp[$geneCols_ary[$j]-1]\n"; | |
92 } else { | |
93 $blah = substr($matchAry[-1],0,-1); | |
94 $blah =~ s/ | |
95 //g; | |
96 $linetmp[$geneCols_ary[$j]-1] = $blah; | |
97 print OUTPUT "Match found for $currentGene, replacing with $linetmp[$geneCols_ary[$j]-1]\n"; | |
98 } | |
99 | |
100 } else { | |
101 ##GIVE SOME OUTPUT TO HINT USER IF GENES ARE 1/2-MAR (REGARDLESS OF WHAT WAS CHOSEN). THIS WILL SLOW CODE DOWN THOUGH... | |
102 my $match_h = qx(cat ./Mar1_2_LUT_human.txt | awk '\$1 == "$currentGene"'); | |
103 my $match_m = qx(cat ./Mar1_2_LUT_mouse.txt | awk '\$1 == "$currentGene"'); | |
104 my $debugL_h = length $match_h; | |
105 my $debugL_m = length $match_m; | |
106 if ( ($debugL_h>0) || ($debugL_m>0) ) { ##We have a 1/2-Mar gene but can't fix it | |
107 print OUTPUT "In file is $currentGene. Cannot replace because "; | |
108 if ($lookupCol) { | |
109 print OUTPUT "second identifier, $current2ndLookup, is not in reference file.\n"; | |
110 } else { | |
111 print OUTPUT "no second identifier column was specified.\n"; | |
112 } | |
113 } | |
114 } | |
115 | |
116 | |
117 } | |
118 print $results join("\t",@linetmp),"\n"; | |
119 } | |
120 | |
121 close $results; | |
122 close OUTPUT; |