diff run-multi_join_serial.pl @ 2:3a9cc859f4c1 draft

Uploaded
author mir-bioinf
date Wed, 15 Apr 2015 14:43:04 -0400
parents
children 0aa0ebcd307c
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/run-multi_join_serial.pl	Wed Apr 15 14:43:04 2015 -0400
@@ -0,0 +1,196 @@
+#!/usr/bin/perl 
+
+
+use Getopt::Long;
+use Pod::Usage;
+use IO::File;
+use Data::Dumper;
+
+#require '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/multi_join_shell.pl';  ##comment this line out when finished testing
+#require '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/addColumnsFromFile2ToFile1.pl';
+#require '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/GetOptWC.pm';
+
+# Option table: every Getopt::Long spec is paired with the package variable it fills.
+my %option_spec = (
+    'log=s'         => \$log,
+    'join_file=s'   => \$data_in,
+    'join_col=s'    => \$coljoin,
+    'time'          => \$mTime,
+    'q|quiet'       => \$quiet,
+    'iteration=i'   => \$I,
+    'totalfiles=i'  => \$N,
+    'with_header=s' => \$header_yes,
+    'input_name=s'  => \$in_name,
+    'resultsfile=s' => \$out_file,
+);    # the "h|help" => \$help entry stays disabled, as before
+GetOptions(%option_spec) or pod2usage( -exitval => 2, -verbose => 2 );
+
+# Check parameters and options. The guard that used to sit here tested $probF, which nothing in this script sets, so it could never fire.
+my $debug = scalar(@ARGV);    # number of arguments left in @ARGV at this point (troubleshooting aid)
+#pod2usage(-msg => "To troubleshoot. ARGV should be @ARGV with $debug arguments in it.");
+# Validate the value that is actually consumed below: join_col is decremented, so it has to be a whole number >= 1.
+pod2usage(-msg => "join_col should be a positive integer (1-based column number)!", -exitval => 2, -verbose => 2)
+    if (!defined $coljoin || $coljoin !~ /^\s*\d+\s*$/ || $coljoin < 1);
+
+$coljoin--;    # presumably 1-based column number -> 0-based index; this is the value written to temp_filenames.txt
+$N++;          # every later comparison against $N uses this incremented totalfiles value
+# Send everything this script prints (STDOUT and STDERR) to the log file.
+use IO::Handle;
+open my $log_out, '>>', $log or die "cant open this file for OUTPUT: $log. Computer says: $!\n";
+open my $log_err, '>>', $log or die "cant open this file for ERROR: $log. Computer says: $!\n";
+# cms: the fdopen mode is 'a' rather than 'w' because of multiple files in one run.
+STDOUT->fdopen( $log_out, 'a' ) or die "cant open file $!\n";
+STDERR->fdopen( $log_err, 'a' ) or die "cant open file $!\n";
+
+my @options;
+my $fileno = 1 + $I;
+
+## Every invocation of this script records its own input in a shared bookkeeping file,
+## one line per input: <join_file value> TAB <join column as held in $coljoin>.
+my $bookkeeping_file = "temp_filenames.txt";
+open $Filenames, '>>', $bookkeeping_file or die "cannot open the temporary file $!\n";
+print {$Filenames} join( "\t", $data_in, $coljoin ), "\n";
+
+## ---------------------------------------------------------------------------
+## Final-iteration join.
+##
+## Each run of this script appends one "file<TAB>join column" line to
+## temp_filenames.txt. On the last iteration the recorded files are joined
+## pairwise with addColumnsFromFile2ToFile1.pl: the first two files are joined
+## into joined.txt, then every further file is joined onto joined.txt, and the
+## result is finally moved to the requested results file.
+## When $N is below 2 only a note is printed; otherwise earlier iterations
+## do nothing in this section.
+## ---------------------------------------------------------------------------
+
+## Write a copy of $source to $target with a generated header line
+## ("C1<TAB>C2<TAB>...<TAB>Cn") in front of it, and return $target.
+##   $source - path of the tab-separated input file that gets the header
+##   $target - path of the file to create (overwritten if present)
+##   $label  - wording used in the debug line ("first file", "from", "from file")
+## The column count n is taken from the first line of $source after trailing
+## whitespace has been removed. Dies if either file cannot be opened or written.
+sub add_generic_header {
+    my ($source, $target, $label) = @_;
+
+    # Checked, mode-explicit open: the old IO::File->new("<$file") was unchecked,
+    # so a missing input only surfaced as a method call on an undefined value.
+    my $in = IO::File->new($source, 'r')
+        or die "Cannot open $source to build its header: $!\n";
+    my $first_line = $in->getline();
+    $first_line = '' unless defined $first_line;    # empty input file
+
+    # Count the tab-separated columns of the first line, then name them C1..Cn.
+    (my $trimmed = $first_line) =~ s/\s+$//;
+    my @cols    = split "\t", $trimmed;
+    my $numcols = @cols;
+    # Same text as the old loop: "C$i\t" for every i < n, then "Cn\n".
+    my $head    = join('', map("C$_\t", 1 .. $numcols - 1)) . "C$numcols\n";
+
+    open(my $out, '>', $target) or die "OOPIES: $!\n";
+    print "\nheader $label $source on next line:\n$head";  ##DEBUG
+    print $out $head, $first_line;
+    # Copy the remaining lines here rather than through "cat $source >> $target",
+    # so a path containing spaces or shell metacharacters cannot break the copy.
+    while (defined(my $line = $in->getline())) {
+        print $out $line;
+    }
+    close($out) or die "Cannot finish writing $target: $!\n";
+    $in->close();
+    return $target;
+}
+
+## Join $file2 onto $file1 with addColumnsFromFile2ToFile1.pl and store the
+## result as joined.txt (file1_file2.txt is presumably the joiner's output file).
+##   $col1, $col2 - join columns of $file1 and $file2, passed as -cola*/-colb*
+## The joiner's real outcome is logged from its wait status (or $! when it could
+## not be started); the old code printed $!, which says nothing about the exit.
+## Returns the raw status from system().
+sub run_join_step {
+    my ($file1, $file2, $col1, $col2) = @_;
+    my $joiner = "/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/addColumnsFromFile2ToFile1.pl";
+    my @args   = ("-File1=$file1", "-File2=$file2", "-cola1=$col1", "-cola2=$col2", "-colb1=$col1", "-colb2=$col2");
+
+    print "\ncommand following:\n";
+    print join(' ', $joiner, @args);
+    # system() returns the wait status (the value of $?), or -1 if the program could not be run.
+    my $status  = system($joiner, @args);
+    my $outcome = $status == -1   ? "failed to start: $!"
+                : ($status & 127) ? "terminated by signal " . ($status & 127)
+                :                   "exit status " . ($status >> 8);
+    print "\nOut from system call on next line:\n$outcome";
+    warn "\njoin of $file1 and $file2 did not succeed ($outcome)\n" if $status != 0;
+
+    # rename() replaces the old "mv file1_file2.txt joined.txt" shell call.
+    rename('file1_file2.txt', 'joined.txt')
+        or warn "\ncannot rename file1_file2.txt to joined.txt: $!\n";
+    return $status;
+}
+
+if (($I==$N-1)&&($N>=2)) {
+    ## At the end of the last iteration
+    close($Filenames);
+
+    print "\nLAST ITERATION COMPLETED and at least two input files provided.\n";
+
+    ##Read in file temp_filenames.txt. The lines are chomped so that the join
+    ##column (last field of each line) no longer carries a trailing newline
+    ##into the joiner's -cola*/-colb* arguments.
+    open(my $tmpfile, "<", "temp_filenames.txt") or die "Cannot open temp file: $!";
+    chomp(my @fileArray = <$tmpfile>);
+    close($tmpfile) or die "what is that??!!! $!";
+
+    ##Output file name and the keep-header flag go at the end of @fileArray:
+    push @fileArray, $out_file;
+    push @fileArray, $header_yes;
+
+    ##Debug:
+    print "\nFirst file fileArray[0] is $fileArray[0].";
+    print "\nOutput file is next-to-last val in fileArray, $fileArray[-2].";
+    print "\nUse header? is last val in fileArray, $fileArray[-1].";
+    print "\nSecond file now is fileArray[2], $fileArray[2].";
+
+    ##When with_header is "no", each input gets a generated C1..Cn header before joining:
+    my $add_header = ($header_yes eq "no");
+
+    ##FIRST TWO ONLY: join them into joined.txt
+    my @first = split('\t',$fileArray[0]);  ##was filename\tJoinCol
+    print "\njoin column from first line is $first[1].";
+    $first[0] = add_generic_header($first[0], './header1.txt', 'first file') if $add_header;
+
+    my @second = split('\t',$fileArray[1]);
+    $second[0] = add_generic_header($second[0], './header2.txt', 'from') if $add_header;
+    run_join_step($first[0], $second[0], $first[1], $second[1]);
+    ##The generated header copies are only needed for the join itself.
+    unlink('./header2.txt', './header1.txt') if $add_header;
+
+    ##Remaining files: join each onto joined.txt, using the first file's join column for joined.txt
+    for (my $f = 2; $f < $N; $f++) {
+        my @current = split('\t',$fileArray[$f]);  ##was filename\tJoinCol
+        print "\njoin column from first line is $first[1].";
+        $current[0] = add_generic_header($current[0], './header.txt', 'from file') if $add_header;
+        run_join_step('joined.txt', $current[0], $first[1], $current[1]);
+        unlink('./header.txt') if $add_header;
+    }
+
+    ##List-form mv: the results path is never parsed by a shell. mv rather than
+    ##rename() because rename() cannot move a file across filesystems.
+    system('mv', 'joined.txt', $fileArray[-2]) == 0
+        or warn "\ncannot move joined.txt to $fileArray[-2]\n";
+
+    ##NOT SURE WHAT TO DO WITH THIS FOR THE MULTI-JOIN TOOL:
+    ##Now, make the EC files from the genes-results files (extract appropriate columns):
+    #my $condStr =  multi_join_shell(@fileArray);  ##RSEMgetTPMs needs to take care of carriage returns
+    ##NEED TO MODIFY RSEMTOEBSEQ_SHELL SO IT TAKES THE OUTPUT FILENAME AS WELL
+
+    ##Remove the bookkeeping file: it is opened in append mode, so leftovers
+    ##would be read again by a later run in the same directory.
+    unlink('temp_filenames.txt');
+}
+elsif ($N<2) {
+    print "\n<br /><i>Only one file; not running join.</i>\n";
+}
+
+
+
+#print "LOG $mv\n";