comparison multi_join_serial/run-multi_join_serial.pl @ 0:1b7d0d2a3543 draft

Uploaded
author mir-bioinf
date Wed, 15 Apr 2015 14:23:56 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1b7d0d2a3543
1 #!/usr/bin/perl
2
3
4 use Getopt::Long;
5 use Pod::Usage;
6 use IO::File;
7 use Data::Dumper;
8
9 #require '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/multi_join_shell.pl'; ##comment this line out when finished testing
10 #require '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/addColumnsFromFile2ToFile1.pl';
11 #require '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/GetOptWC.pm';
12
# Command-line options. They are declared as file-scoped lexicals so that the
# rest of the script sees them under the same names without relying on
# undeclared package globals.
#
#   --log          file that STDOUT and STDERR are appended to
#   --join_file    input file handled by this invocation
#   --join_col     join column of --join_file (decremented by one after parsing)
#   --time         parsed, not used in this script
#   --q / --quiet  parsed, not used in this script
#   --iteration    index of this invocation within the series of runs
#   --totalfiles   file count (incremented by one after parsing, then compared
#                  with --iteration to detect the last run)
#   --with_header  compared with "no": in that case C1..Cn headers are generated
#   --input_name   parsed, not used in this script
#   --resultsfile  destination of the final joined table
my ($log, $data_in, $coljoin, $mTime, $quiet);
my ($I, $N, $header_yes, $in_name, $out_file);

GetOptions(
    "log=s"         => \$log,
    "join_file=s"   => \$data_in,
    "join_col=s"    => \$coljoin,
    "time"          => \$mTime,
    "q|quiet"       => \$quiet,
    "iteration=i"   => \$I,
    "totalfiles=i"  => \$N,
    "with_header=s" => \$header_yes,
    "input_name=s"  => \$in_name,
    "resultsfile=s" => \$out_file,
#   "h|help"        => \$help
) or pod2usage( -exitval => 2, -verbose => 2 );
26
27
28 #check parameters and options
# Number of leftover command-line arguments; only referenced by the
# commented-out troubleshooting call below.
my $debug = scalar(@ARGV);

# The join column is shifted down by one before it is recorded.
# NOTE(review): presumably converts a 1-based column number from the caller
# into a 0-based index for the join tool -- confirm against the wrapper.
$coljoin--;
#pod2usage(-msg => "To troubleshoot. ARGV should be @ARGV with $debug arguments in it.");

# A range check on $probF ("Forward probability should be in [0, 1]!") used to
# live here. No option ever sets $probF, so the check could never fire and its
# message had nothing to do with this tool; it has been removed.

# The file count is bumped by one; the last run is later recognised by
# iteration == N-1.
$N++;
36
37 # #
use IO::Handle;

# Everything this script (and the programs it launches) prints goes to the log
# file. Two independent append-mode handles are opened on the same file, one
# for each standard stream.
unless (open(OUTPUT, '>>', $log)) {
    die "cant open this file for OUTPUT: $log. Computer says: $!\n";
}
unless (open(ERROR, '>>', $log)) {
    die "cant open this file for ERROR: $log. Computer says: $!\n";
}

# Re-point STDOUT and STDERR at those handles. Mode 'a' (not 'w') because
# several invocations of this script write to the same log in one run.
foreach my $redirect ([\*STDOUT, \*OUTPUT], [\*STDERR, \*ERROR]) {
    my ($stream, $target) = @{$redirect};
    $stream->fdopen($target, 'a') or die "cant open file $!\n";
}
43 # # #
44
use File::Copy qw(move);

# File that accumulates one "<input file>\t<join column>" line per invocation
# of this script; it is read back and deleted by the last invocation.
my $TRACKING_FILE = 'temp_filenames.txt';

# External program that joins two tables. It is expected to leave its result
# in ./file1_file2.txt (that is the file picked up after every call).
my $JOIN_TOOL = '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/addColumnsFromFile2ToFile1.pl';

# Writes a copy of $source to $target with a generated header line in front.
# The header is C1<TAB>C2<TAB>...<TAB>Cn, where n is the number of
# tab-separated columns on the first line of $source (trailing whitespace
# ignored). An empty $source yields the placeholder header "C0", as before.
# Returns $target. Dies if either file cannot be opened or written.
sub _prepend_generated_header {
    my ($source, $target) = @_;

    open(my $in, '<', $source)
        or die "Cannot open $source to build its header: $!\n";
    my $first_line = <$in>;

    my $column_count = 0;
    if (defined $first_line) {
        (my $trimmed = $first_line) =~ s/\s+$//;
        my @columns = split /\t/, $trimmed;
        $column_count = scalar @columns;
    }
    my $header = $column_count
        ? join("\t", map { "C$_" } 1 .. $column_count)
        : 'C0';

    open(my $out, '>', $target) or die "OOPIES: $!\n";
    print "\nheader from file $source on next line:\n$header\n"; ##DEBUG
    print {$out} "$header\n" or die "Cannot write to $target: $!\n";

    # Copy the data in Perl instead of shelling out to `cat`, so file names
    # with spaces or shell metacharacters are handled correctly.
    if (defined $first_line) {
        print {$out} $first_line or die "Cannot write to $target: $!\n";
        while (my $line = <$in>) {
            print {$out} $line or die "Cannot write to $target: $!\n";
        }
    }
    close($out) or die "Cannot finish writing $target: $!\n";
    close($in);

    return $target;
}

# Runs the join tool on $file1 and $file2 using $col1 / $col2 as the join
# columns (passed as both -cola* and -colb*), then moves the tool's result
# file1_file2.txt to joined.txt. Dies if the tool cannot be started, exits
# with a non-zero status, or leaves no result file.
sub _run_join {
    my ($file1, $file2, $col1, $col2) = @_;

    my @command = (
        $JOIN_TOOL,
        "-File1=$file1", "-File2=$file2",
        "-cola1=$col1",  "-cola2=$col2",
        "-colb1=$col1",  "-colb2=$col2",
    );
    print "\ncommand following:\n";
    print join(' ', @command);

    # List form of system(): no shell is involved.
    my $wait_status = system(@command);
    die "\nCould not start $JOIN_TOOL: $!\n" if $wait_status == -1;
    print "\nExit status from system call: " . ($wait_status >> 8) . "\n";
    die "$JOIN_TOOL failed (wait status $wait_status)\n" if $wait_status != 0;

    move('file1_file2.txt', 'joined.txt')
        or die "Cannot move file1_file2.txt to joined.txt: $!\n";
    return;
}

##Keeping track of the input files (one per iteration of this script) in an external file:
open(my $Filenames, '>>', $TRACKING_FILE) or die "cannot open the temporary file $!\n";
print {$Filenames} "$data_in\t$coljoin\n"
    or die "cannot write to the temporary file $!\n";
# Closed right away so the record is on disk before it is read back below.
close($Filenames) or die "cannot close the temporary file $!\n";

if (($I == $N - 1) && ($N >= 2)) {
    ## At the end of the last iteration
    print "\nLAST ITERATION COMPLETED and at least two input files provided.\n";

    ##Read in file temp_filenames.txt: one [file, join column] pair per line.
    ##Lines are chomped so the join column does not carry a trailing newline
    ##into the arguments of the join tool.
    open(my $tmpfile, '<', $TRACKING_FILE) or die "Cannot open temp file: $!";
    my @records;
    while (my $line = <$tmpfile>) {
        chomp $line;
        next if $line eq '';
        my ($file, $column) = split /\t/, $line, 2;
        push @records, [$file, $column];
    }
    close($tmpfile) or die "what is that??!!! $!";

    die "Need at least two recorded input files in $TRACKING_FILE, found "
        . scalar(@records) . "\n" if @records < 2;

    # Only the first $N records are joined (same as before). If fewer were
    # recorded than announced, join what is there and say so in the log.
    my $file_count = $N;
    if (@records < $file_count) {
        warn "Expected $N recorded input files but found " . scalar(@records)
            . "; joining those.\n";
        $file_count = scalar @records;
    }

    # --with_header=no means the inputs have no header line, so one is generated.
    my $add_header = (defined $header_yes && $header_yes eq 'no');

    ##Debug:
    print "\nFirst file is $records[0][0].";
    print "\nSecond file is $records[1][0].";
    print "\nOutput file is $out_file.";
    print "\nUse header? is $header_yes.";

    ##FIRST TWO ONLY: join file 1 with file 2 into joined.txt
    my ($first_file,  $first_col)  = @{ $records[0] };
    my ($second_file, $second_col) = @{ $records[1] };
    print "\njoin column from first line is $first_col.";

    if ($add_header) {
        ##use the header-prefixed copies instead of the original inputs
        $first_file  = _prepend_generated_header($first_file,  './header1.txt');
        $second_file = _prepend_generated_header($second_file, './header2.txt');
    }
    _run_join($first_file, $second_file, $first_col, $second_col);
    unlink('./header2.txt', './header1.txt') if $add_header;

    ##Every further file is joined onto the running result joined.txt, which
    ##is keyed by the join column of the first file.
    for my $index (2 .. $file_count - 1) {
        my ($current_file, $current_col) = @{ $records[$index] };
        print "\njoin column from first line is $first_col.";

        if ($add_header) {
            $current_file = _prepend_generated_header($current_file, './header.txt');
        }
        _run_join('joined.txt', $current_file, $first_col, $current_col);
        unlink('./header.txt') if $add_header;
    }

    # move() also works when the results file is on another filesystem.
    move('joined.txt', $out_file)
        or die "Cannot move joined.txt to $out_file: $!\n";

    ##NOT SURE WHAT TO DO WITH THIS FOR THE MULTI-JOIN TOOL:
    ##Now, make the EC files from the genes-results files (extract appropriate columns):
    #my $condStr = multi_join_shell(@fileArray); ##RSEMgetTPMs needs to take care of carriage returns
    ##NEED TO MODIFY RSEMTOEBSEQ_SHELL SO IT TAKES THE OUTPUT FILENAME AS WELL

    unlink($TRACKING_FILE) or warn "Could not remove $TRACKING_FILE: $!\n";
}
elsif ($N < 2) {
    print "\n<br /><i>Only one file; not running join.</i>\n";
}
193
194
195
196 #print "LOG $mv\n";