view copyNextSeq.pl @ 0:d4ac6e05c96c default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:43:47 -0600
parents
children
line wrap: on
line source

#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use File::Find;
use File::Basename;
use vars qw(@fastq_files);

# The Python tool used further down to produce the FASTQC reports is looked up
# in the same directory as this script.
my $dirname = dirname(__FILE__);
my $pythonScript = "$dirname/rgFastQC.py";

# The first command-line argument is the tool directory. It is removed from
# @ARGV here, so that only the named options are left for GetOptions() below.
my $tool_dir = shift @ARGV;
if(not defined $tool_dir or not -d $tool_dir){
  die "Usage: $0 <tool dir> <options>\n",
      "The first argument must be an existing tool directory\n";
}
# FastQC launcher, handed to the Python tool through its -e option
my $pythonJars = "$tool_dir/shared/jars/FastQC/fastqc";

# Site config
my $num_threads = 32;              # only referenced by the (currently disabled) bcl2fastq step
my $fastq_sample_size = 400000;    # number of FASTQ lines (head -n) sampled per file for FASTQC
my $seq_host = "10.81.192.138";    # host that is queried over ssh for run folders
my $seq_username = "nextseq-user"; # ssh account on $seq_host
my $seq_dir = "Desktop/Share";     # remote directory whose entries are the run folders

# Get the local data directory from the tool's .loc file. Only the first line
# is read; the value is whatever follows the last "=" on that line, with all
# whitespace removed.
my $loc_file = "$tool_dir/transfer_convert_nextseq.loc";
if(not -e $loc_file){
  # Seed the configuration from the copy shipped next to this script
  system("cat $dirname/tool-data/transfer_convert_nextseq.loc > $loc_file");
}
open(my $loc_fh, '<', $loc_file) or die "Could not open configuration file: $!\n";
my $loc_line = <$loc_fh>;
close($loc_fh);

# An empty file or an empty value would otherwise silently turn every later
# path into one relative to the filesystem root.
my @keys = defined $loc_line ? split("=", $loc_line) : ();
my $local_dir = @keys ? $keys[$#keys] : '';
$local_dir =~ s/\s+//g;
if($local_dir eq ''){
  die "No local directory is configured on the first line of $loc_file\n";
}

# Named command-line options. The leading tool directory argument has already
# been shifted off @ARGV before this point, so only the options remain.
my $runName;     # -run: text used to pick the run folder on the remote host
my $sampleSheet; # -samplesheet: Illumina sample sheet (CSV)
my $user;        # -user: identity that is looked up in nextseq_access.conf
my $accessFile;  # -toolDir: directory containing nextseq_access.conf
my $outDir;      # -out: output directory for the FASTQC report
my $htmlFile;    # -html: FASTQC report file name
my $archiveFile; # -archive: zip file for the SAV files (not checked below)

GetOptions("run=s"         => \$runName,
           "samplesheet=s" => \$sampleSheet,
           "user=s"        => \$user,
           "toolDir=s"     => \$accessFile,
           "out=s"         => \$outDir,
           "html=s"        => \$htmlFile,
           "archive=s"     => \$archiveFile);

# Everything except -archive must be present. The usage text includes the
# positional tool directory, because the script consumes it first.
if(grep {not defined $_} ($runName, $sampleSheet, $user, $accessFile, $outDir, $htmlFile)){
  die "Usage: $0 <tool dir> -run <unique_suffix> -samplesheet <illumina.csv> -user <user\@domain in nextseq_access.conf> -toolDir <galaxy tool conf dir> ",
      "-out <output dir for FASTQC report> -html <FASTQC report file name> -archive <SAV files.zip>\n";
}

# From here on $accessFile is the access list itself rather than its directory
$accessFile = "$accessFile/nextseq_access.conf";

# Create the access file if it is not already there
if(not -e $accessFile){
  open(my $new_access, '>>', $accessFile)
    or die "Cannot create $accessFile: $!\n";
  close($new_access);
}

# One allowed user per line
open(my $handle, '<', $accessFile)
  or die "Cannot open $accessFile for reading: $!\n";
chomp(my @allowed_users = <$handle>);
close($handle);

# The run name is matched as a regular expression later on, so neutralise any
# metacharacters it may contain.
$runName = quotemeta($runName);

# check to make sure $user is allowed to run script (exact, case-sensitive match)
if(not grep {$_ eq $user} @allowed_users){
	die "Please ask the administrator to add $user to $accessFile in order to gain access to this tool\n";
}

# First, sanity check the sample sheet: it has to contain lines starting with
# [Header], [Reads] and [Data]. The lines are kept in @CSV so that they can be
# written out again next to the transferred data.
open(my $sheet_in, '<', $sampleSheet)
  or die "Cannot open $sampleSheet for reading: $!\n";
# Slurp the whole file; "local" restores the record separator automatically
my $sheet_text = do { local $/; <$sheet_in> };
close($sheet_in);
my @CSV = split /\r?\n/, (defined $sheet_text ? $sheet_text : ''); # allow different endings

my ($has_header, $has_reads, $has_data);
for my $line (@CSV){
  if($line =~ /^\[Header\]/){
    $has_header = 1;
  }
  elsif($line =~ /^\[Reads\]/){
    $has_reads = 1;
  }
  elsif($line =~ /^\[Data\]/){
    $has_data = 1;
  }
}
if(not defined $has_header){
	die "Header section is missing in sample sheet, please fix and resubmit this job\n";
}
if(not defined $has_reads){
	die "Reads section is missing in sample sheet, please fix and resubmit this job\n";
}
if(not defined $has_data){
	die "Data section is missing in sample sheet, please fix and resubmit this job\n";
}

# Expand the cartridge ID into the full run name on the remote host, input should look something like "H35VJBGXX"
# The remote directory listing is read through ssh; the list form of open is
# used so that no local shell gets to reinterpret the command.
open(my $ssh, '-|', 'ssh', "$seq_username\@$seq_host", 'ls', '-1', $seq_dir)
  or die "Could not run ssh login to $seq_host: $!\n";
my $run_pattern = qr/$runName/; # compiled once, reused for every listing entry
my @matchOptions;    # remote entries that contain the requested run name
my @mismatchOptions; # every other remote entry, used for the suggestion below
while(my $entry = <$ssh>){
	chomp $entry;
	if($entry =~ $run_pattern){
		push @matchOptions, $entry;
	}
	else{
		push @mismatchOptions, $entry;
	}
}
close($ssh);
if(not @matchOptions){
	if(not @mismatchOptions){
		die "There was no data found on the rempote server at all, please ask the administrator to ",
		    "check this tool's setup (currently checking $seq_username\@$seq_host:$seq_dir)\n";
	}
	# Keep only the ones not already uploaded as options
	@mismatchOptions = grep {not -e "$local_dir/$_"} @mismatchOptions;
	die "No run folder matching $runName was found at $seq_username\@$seq_host:$seq_dir, please try with another ",
	    "run name. The following would work currently: ", join(", ", @mismatchOptions), "\n";
}
elsif(@matchOptions > 1){
	die "Ambiguous run name specification, please revise \"$runName\" to distinguish between existing datasets: ",
	    join(", ", @matchOptions), "\n";
}
my $expandedRunName = $matchOptions[0]; # unambiguous, so proceed

# If the run already exists as a folder under $local_dir, the script is meant
# to stop; otherwise the run folder is meant to be copied over with scp.
# NOTE(review): both actions are commented out, so this conditional currently
# does nothing either way and no data is transferred here - confirm whether
# that is intentional before relying on it.
if(-e "$local_dir/$expandedRunName"){
#	die "Run $expandedRunName already exists on galaxy ($local_dir/$expandedRunName), cannot copy over\n";
}
# if not, copy to folder
else{
#	system("scp -r $seq_username\@$seq_host\:$seq_dir/$expandedRunName $local_dir") >> 8 and die "Failed to copy from $seq_host to galaxy: scp exit status $?\n";
}

# Put the sample sheet where it needs to be with the transferred data.
# The lines are joined with "\n", so the copy has Unix line endings and no
# trailing newline.
my $sheet_copy = "$local_dir/$expandedRunName/SampleSheet.csv";
open(my $sheet_out, '>', $sheet_copy)
  or die "Cannot open $sheet_copy for writing: $!\nThe data files have been transfered, but no BCL to FASTQ conversion has taken place.\n";
print $sheet_out join("\n", @CSV);
# Buffered write errors (e.g. a full disk) only surface when the handle is closed
close($sheet_out)
  or die "Cannot finish writing $sheet_copy: $!\n";

# convert bcl files to fastq
#system("cd $local_dir/$expandedRunName; /export/common/programs/bcl2fastq/bin/bcl2fastq -r $num_threads -d $num_threads -p $num_threads -w $num_threads")>>8
#  and die "BCL to FASTQ conversion had non-zero exit status ($?). The BCL files were transfered, but FASTQ files were not generated.\n";

# Find the FASTQ files generated: every entry below the run folder whose name
# ends in ".fastq.gz" (both dots are literal) is collected with its full path.
find(sub{push @fastq_files, $File::Find::name if /\.fastq\.gz$/}, "$local_dir/$expandedRunName");

# Run FASTQC on sample of data from each lane/barcode
# open output file and write html: $htmlFile becomes a table of contents with
# one link per FASTQ file, each pointing at $outDir/<barcode>/fastqc_report.html
open(my $html_fh, '>', $htmlFile)
  or die "Cannot open $htmlFile for writing: $!\n";
print $html_fh "<html><body><h1>Barcodes</h1>";
system('mkdir', '-p', $outDir) == 0
  or die "Cannot create output directory $outDir (mkdir exit status $?)\n";

# generate html plot using python tool
# "head" stops reading after $fastq_sample_size lines, so the upstream gzip
# runs into a closed pipe; SIGPIPE is ignored and the resulting "Broken pipe"
# complaints are filtered out below.
$SIG{'PIPE'} = 'IGNORE';
foreach my $file (@fastq_files){
	# The barcode is the file name without directory and ".fastq.gz" suffix
	my ($barcode, undef, $ext) = fileparse($file, qr/\.fastq\.gz/);
	my $cmd = "gzip -cd $file | head -n $fastq_sample_size | python $pythonScript -i /dev/stdin "
	        . "-d $outDir/$barcode/. "
	        . "-o fastqc_report.html "
	        . "-n \"FASTQC $barcode\" "
	        . "-f \"FASTQ\" "
	        . "-j \"$barcode$ext\" "
	        . "-e $pythonJars";
	# "trap '' PIPE" is the POSIX spelling, so this works whether sh is bash or not.
	# The braces make 2>&1 apply to the whole pipeline, so that the messages of
	# gzip and head pass through the grep filter as well.
	open(my $fastqc, '-|', "trap '' PIPE; { $cmd; } 2>&1 | grep -v \"Broken pipe\"")
	  or die "Cannot run FASTQC: $!\n";
	while(my $line = <$fastqc>){
		# Can safely ignore blank lines and SIGPIPE warnings
		next if $line =~ /^\s*$/ or $line =~ /Broken pipe/;
		print STDERR $line; # forward any other errors
	}
	close($fastqc);

	# Post-process the report in place (a .bak copy is kept): add a link back to
	# the table of contents and point the first Images/Icons reference on each
	# line at the report's own directory. Best effort, failures are not fatal.
	system('perl', '-i.bak', '-pe',
	       q{s/>FastQC Report</>FastQC Report<div><a href='..\/index.html'>Back to Table of Contents<\/a><\/div></;s/Images|Icons/./},
	       "$outDir/$barcode/fastqc_report.html");
	# Extract the icon images flat (-j) into the report directory; the wildcard
	# is interpreted by unzip itself.
	system('unzip', '-o', '-d', "$outDir/$barcode", '-qq', '-j',
	       "$outDir/$barcode/${barcode}_fastqc.zip", "${barcode}_fastqc/Icons/*.png");
	# append to html file
	print $html_fh "<div><a href='$barcode/fastqc_report.html'>$barcode</a></div>";
}


print $html_fh "</body></html>";
# Buffered write errors only surface when the handle is closed
close($html_fh)
  or die "Cannot finish writing $htmlFile: $!\n";
# Publish the table of contents next to the per-barcode report directories
system('cp', $htmlFile, "$outDir/index.html");

# Bundle the run's RunInfo.xml, RunParameters.xml and InterOp folder into the
# requested archive. -archive is not enforced by the option check, so the step
# is skipped when it was not given instead of running rm/zip without a target.
if(defined $archiveFile){
  # Nothing is removed or zipped unless the run folder could be entered. Any
  # existing file is removed first so that zip starts a fresh archive.
  system("cd $local_dir/$expandedRunName && { rm -f $archiveFile; zip -r $archiveFile RunInfo.xml RunParameters.xml InterOp -q; }");
}