Mercurial > repos > mir-bioinf > multi_join_left
changeset 3:0aa0ebcd307c draft
Uploaded
| author | mir-bioinf | 
|---|---|
| date | Wed, 15 Apr 2015 16:31:04 -0400 | 
| parents | 3a9cc859f4c1 | 
| children | 46c880ae6db2 | 
| files | multi_join_serial.xml run-multi_join_serial.pl | 
| diffstat | 2 files changed, 2 insertions(+), 33 deletions(-) [+] | 
line wrap: on
 line diff
--- a/multi_join_serial.xml Wed Apr 15 14:43:04 2015 -0400 +++ b/multi_join_serial.xml Wed Apr 15 16:31:04 2015 -0400 @@ -7,7 +7,7 @@ #end for #for $i, $s in enumerate( $Files ) - /opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/run-multi_join_serial.pl --join_file $s.joinMe --join_col $s.joinCol --iteration $i --totalfiles $j --with_header $headerYes --resultsfile $Joined_all --log $log + /opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/run-multi_join_serial.pl --join_file $s.joinMe --join_col $s.joinCol --iteration $i --totalfiles $j --with_header $headerYes --resultsfile $Joined_all ##print "loop iteration $i.\n"; ; #end for @@ -24,7 +24,6 @@ </inputs> <outputs> <data format="tabular" name="Joined_all" label="Multi-Join result"/> - <data format="txt" name="log" label="debug_info"/> </outputs> <tests> <test> @@ -36,18 +35,12 @@ <param name="Files_2joinCol" value="2"/> <param name="headerYes" value="yes"/> <output name="Joined_all" value="multi_join_serial_out.tab" ftype="tabular"/> - <output name="log" value="multi_join_serial_debug.txt" ftype="tabular"/> <test/> <tests/> <help> This tool performs a left-outer join on multiple (at least two) files using a perl script that Ron wrote (thanks, Ron!). The resulting joined file will have the same number of rows as the first file chosen and subsequent files' matches will be shown if present. Rows in the first file without matches in the other files will have empty cells. If none of the input files have a header present, a simple column number header will be added to the output file to denote the start of each set of matches (from each file, start denoted by "C1"). -To convert from left-outer join result to inner join result (only include rows in common to all datasets), run Filter out rows and columns with non-numeric values tool with the following options selected (last 3 options, all are drop-down select menus): - 1. Replace/remove: Empty only - 2. Remove entire column or row (leave default) - 3. Remove non-numeric/empty cell-containing ROWS from dataset - .. class:: warningmark @@ -58,7 +51,6 @@ 1. Click Add new File for each tab-delimited file you'd like to add and the column you want to join on. 2. After adding all files to join, select whether the headers should all be preserved (this should be Yes if all input datasets have headers). 3. Click Execute. - 4. Please report any issues and/or suggestions to Christy. -----
--- a/run-multi_join_serial.pl Wed Apr 15 14:43:04 2015 -0400 +++ b/run-multi_join_serial.pl Wed Apr 15 16:31:04 2015 -0400 @@ -11,7 +11,6 @@ #require '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/GetOptWC.pm'; GetOptions( - "log=s" => \$log, "join_file=s" => \$data_in, "join_col=s" => \$coljoin, "time" => \$mTime, @@ -36,8 +35,6 @@ # # use IO::Handle; -open OUTPUT, '>>',$log or die "cant open this file for OUTPUT: $log. Computer says: $!\n";; -open ERROR, '>>', $log or die "cant open this file for ERROR: $log. Computer says: $!\n"; STDOUT->fdopen( \*OUTPUT, 'a' ) or die "cant open file $!\n"; #cms changing mode from 'w' to 'a' for multiple files in one run STDERR->fdopen( \*ERROR, 'a' ) or die "cant open file $!\n"; #cms changing mode from 'w' to 'a' for multiple files in one run # # # @@ -55,7 +52,6 @@ ## At the end of the last iteration close($Filenames); - print "\nLAST ITERATION COMPLETED and at least two input files provided.\n"; ##Read in file temp_filenames.txt open(my $tmpfile, "<", "temp_filenames.txt") or die "Cannot open temp file: $!"; @@ -69,11 +65,6 @@ ##Also need to send yes/no for keeping header: push @fileArray, $header_yes; - ##Debug: - print "\nFirst file fileArray[0] is $fileArray[0]."; - print "\nOutput file is next-to-last val in fileArray, $fileArray[-2]."; - print "\nUse header? is last val in fileArray, $fileArray[-1]."; - print "\nSecond file now is fileArray[2], $fileArray[2]."; ##@fileArray has one file per line,output,header_yes, so $N+1 rows my $f=0; @@ -82,7 +73,6 @@ do { @first = split('\t',$fileArray[$f]); ##was filename\tJoinCol - print "\njoin column from first line is $first[1]."; ##CMS DEALING WITH HEADER OR NOT: if ($header_yes eq "no") { @@ -90,7 +80,6 @@ $fh1 = IO::File->new("<$first[0]"); my $line1file1 = $fh1->getline(); $line1file1 =~ s/\s+$//; - #print "\nline1file1 is $line1file1\n"; ##DEBUG @cols = split "\t",$line1file1; my $numcols1 = @cols; my $head1; @@ -99,7 +88,6 @@ } $head1.="C$numcols1\n"; open(my $fh_sub, '>', './header1.txt') or die "OOPIES: $!\n"; - print "\nheader first file $first[0] on next line:\n$head1"; ##DEBUG print $fh_sub $head1; close $fh_sub; system("cat $first[0] >> ./header1.txt"); ##put header in front of file @@ -123,18 +111,14 @@ $head2.="C$i\t"; } $head2.="C$numcols2\n"; - open(my $fh_sub, '>', './header2.txt') or die "OOPIES: $!\n"; - print "\nheader from $second[0] on next line:\n$head2"; ##DEBUG + open(my $fh_sub, '>', './header2.txt') or die "ERROR: $!\n"; print $fh_sub $head2; close $fh_sub; system("cat $second[0] >> ./header2.txt"); $second[0]="./header2.txt"; } - print "\ncommand following:\n"; - print "/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/addColumnsFromFile2ToFile1.pl -File1=$first[0] -File2=$second[0] -cola1=$first[1] -cola2=$second[1] -colb1=$first[1] -colb2=$second[1]"; system("/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/addColumnsFromFile2ToFile1.pl", "-File1=$first[0]", "-File2=$second[0]", "-cola1=$first[1]", "-cola2=$second[1]", "-colb1=$first[1]", "-colb2=$second[1]"); - print "\nOut from system call on next line:\n$!"; $f+=2; system("mv file1_file2.txt joined.txt"); if ($header_yes eq "no") { @@ -145,7 +129,6 @@ for ($f; $f<$N; $f++) { my @current = split('\t',$fileArray[$f]); ##was filename\tJoinCol - print "\njoin column from first line is $first[1]."; if ($header_yes eq "no") { my $fh; @@ -160,17 +143,13 @@ } $head.="C$numcols\n"; open(my $fh_sub, '>', './header.txt') or die "OOPIES: $!\n"; - print "\nheader from file $current[0] on next line:\n$head"; ##DEBUG print $fh_sub $head; close $fh_sub; system("cat $current[0] >> ./header.txt"); $current[0]="./header.txt"; } - print "\ncommand following:\n"; - print "/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/addColumnsFromFile2ToFile1.pl -File1=joined.txt -File2=$current[0] -cola1=$first[1] -cola2=$current[1] -colb1=$first[1] -colb2=$current[1]"; system("/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/addColumnsFromFile2ToFile1.pl","-File1=joined.txt", "-File2=$current[0]", "-cola1=$first[1]", "-cola2=$current[1]", "-colb1=$first[1]", "-colb2=$current[1]"); - print "\nOut from system call on next line:\n$!"; system("mv file1_file2.txt joined.txt"); if ($header_yes eq "no") { system("rm ./header.txt"); @@ -188,9 +167,7 @@ } elsif ($N<2) { - print "\n<br /><i>Only one file; not running join.</i>\n"; } -#print "LOG $mv\n";
