changeset 0:d4ac6e05c96c default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:43:47 -0600
parents
children
files copyNextSeq.pl rgFastQC.py tool-data/transfer_convert_nextseq.loc transfer_convert_nextseq.xml
diffstat 4 files changed, 434 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/copyNextSeq.pl	Wed Mar 25 13:43:47 2015 -0600
@@ -0,0 +1,188 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+use Getopt::Long;
+use File::Find;
+use File::Basename;
+use vars qw(@fastq_files);
+
+my $dirname = dirname(__FILE__);
+my $pythonScript = "$dirname/rgFastQC.py";
+my $tool_dir = shift @ARGV;
+my $pythonJars = "$tool_dir/shared/jars/FastQC/fastqc";
+
+# Site config
+my $num_threads = 32;
+my $fastq_sample_size = 400000;
+my $seq_host = "10.81.192.138";
+my $seq_username = "nextseq-user";
+my $seq_dir = "Desktop/Share";
+
+#get localdir
+if(not -e "$tool_dir/transfer_convert_nextseq.loc"){
+  system("cat $dirname/tool-data/transfer_convert_nextseq.loc > $tool_dir/transfer_convert_nextseq.loc");
+}
+open FILE, "$tool_dir/transfer_convert_nextseq.loc" or die "Could not open configuration file: $!\n";
+my @keys = split("=",<FILE>);
+(my $local_dir = $keys[$#keys]) =~s/\s+//g;
+close FILE;
+
+# store arguments into variables
+my $runName;
+my $sampleSheet;
+my $user;
+my $accessFile;
+my $outDir;
+my $htmlFile;
+my $archiveFile;
+
+GetOptions ("run=s" => \$runName,
+            "samplesheet=s" => \$sampleSheet,
+	    "user=s" => \$user,
+	    "toolDir=s" => \$accessFile,
+	    "out=s" => \$outDir,
+	    "html=s" => \$htmlFile,
+            "archive=s" => \$archiveFile);
+
+if(not defined $runName or not defined $sampleSheet or not defined $user or not defined $accessFile or not defined $outDir or not defined $htmlFile){
+  die "Usage: $0 -run <unique_suffix> -samplesheet <illumina.csv> -user <user\@domain in nextseq_access.conf> -toolDir <galaxy tool conf dir> ",
+      "-out <output dir for FASTQC report> -html <FASTQC report file name> -archive <SAV files.zip>\n";
+}
+
+$accessFile = "$accessFile/nextseq_access.conf";
+
+# create access file if not already there
+my $command = `touch $accessFile`;
+open my $handle, '<', "$accessFile";
+chomp(my @allowed_users = <$handle>);
+
+$runName = quotemeta($runName);
+
+my ($out_file, $out_path, $out_ext ) = fileparse( $htmlFile, "\.[^.]*" );
+
+# check to make sure $user is allowed to run script
+if (! ($user ~~ @allowed_users) ){
+	die "Please ask the administrator to add $user to $accessFile in order to gain access to this tool\n";
+}
+
+# First, sanity check the sample file
+open(CSV, $sampleSheet)
+  or die "Cannot open $sampleSheet for reading: $!\n";
+undef $/; # slurp up whole file at once by undefining record separator
+my @CSV = split /\r?\n/, <CSV>; # allow different endings
+close(CSV);
+$/="\n"; # restore normal per-line reading
+my ($has_header, $has_reads, $has_data);
+for(@CSV){
+  if(/^\[Header\]/){
+    $has_header = 1;
+  }
+  elsif(/^\[Reads\]/){
+    $has_reads = 1;
+  }
+  elsif(/^\[Data\]/){
+    $has_data = 1;
+  }
+}
+if(not defined $has_header){
+	die "Header section is missing in sample sheet, please fix and resubmit this job\n";
+}
+if(not defined $has_reads){
+	die "Reads section is missing in sample sheet, please fix and resubmit this job\n";
+}
+if(not defined $has_data){
+	die "Data section is missing in sample sheet, please fix and resubmit this job\n";
+}
+
+# Expand the catridge ID into the full run name on the remote host, input should look something like "H35VJBGXX"
+open(SSH, "ssh $seq_username\@$seq_host ls -1 $seq_dir |")
+  or die "Could not run ssh login to $seq_host: $!\n";
+my @matchOptions;
+my @mismatchOptions;
+while(<SSH>){
+	chomp;
+	if(/$runName/o){
+		push @matchOptions, $_;	
+	}
+	else{
+		push @mismatchOptions, $_;
+	}
+}
+close(SSH);
+if(not @matchOptions){
+	if(not @mismatchOptions){
+		die "There was no data found on the rempote server at all, please ask the administrator to ",
+		    "check this tool's setup (currently checking $seq_username\@$seq_host:$seq_dir)\n";
+	}
+	# Keep only the ones not already uploaded as options
+	@mismatchOptions = grep {not -e "$local_dir/$_"} @mismatchOptions;
+	die "No run folder matching $runName was found at $seq_username\@$seq_host:$seq_dir, please try with another ",
+	    "run name. The following would work currently: ", join(", ", @mismatchOptions), "\n";  
+}
+elsif(@matchOptions > 1){
+	die "Ambiguous run name specification, please revise \"$runName\" to distinguish between existing datasets: ",
+	    join(", ", @matchOptions), "\n"; 
+}
+my $expandedRunName = $matchOptions[0]; # unambiguous, so proceed
+
+# if sample already exits as a folder, die
+if(-e "$local_dir/$expandedRunName"){
+#	die "Run $expandedRunName already exists on galaxy ($local_dir/$expandedRunName), cannot copy over\n";
+}
+# if not, copy to folder
+else{
+#	system("scp -r $seq_username\@$seq_host\:$seq_dir/$expandedRunName $local_dir") >> 8 and die "Failed to copy from $seq_host to galaxy: scp exit status $?\n";
+}
+
+# Put the sample sheet where it needs to be with the transfered data
+open(CSV, ">$local_dir/$expandedRunName/SampleSheet.csv")
+  or die "Cannot open $local_dir/$expandedRunName/SampleSheet.csv for writing: $!\nThe data files have been transfered, but no BCL to FASTQ conversion has taken place.\n";
+print CSV join("\n", @CSV);
+close(CSV);
+
+# convert bcl files to fastq
+#system("cd $local_dir/$expandedRunName; /export/common/programs/bcl2fastq/bin/bcl2fastq -r $num_threads -d $num_threads -p $num_threads -w $num_threads")>>8
+#  and die "BCL to FASTQ conversion had non-zero exit status ($?). The BCL files were transfered, but FASTQ files were not generated.\n";
+
+# Find the FASTQ files generated
+find(sub{push @fastq_files, $File::Find::name if /\.fastq.gz$/}, "$local_dir/$expandedRunName");
+
+# Run FASTQC on sample of data from each lane/barcode
+# open output file and write html
+open(OUTFILE, ">$htmlFile")
+  or die "Cannot open $htmlFile for writing: $!\n";
+print OUTFILE "<html><body><h1>Barcodes</h1>";
+system("mkdir -p $outDir");
+
+# generate html plot using python tool
+$SIG{'PIPE'} = 'IGNORE'; 
+my $cwd = dirname(__FILE__);
+foreach my $file (@fastq_files){
+	my ($barcode, $path, $ext ) = fileparse( $file, "\.fastq\.gz" );
+	my $cmd = "gzip -cd $file | head -n $fastq_sample_size | python $pythonScript -i /dev/stdin "
+. "-d $outDir/$barcode/. "
+. "-o fastqc_report.html "
+. "-n \"FASTQC $barcode\" "
+. "-f \"FASTQ\" "
+. "-j \"$barcode$ext\" "
+. "-e $pythonJars"; 
+	# Assumes the bash shell is being used
+	open(CMD, "trap '' SIGPIPE; $cmd 2| grep -v \"Broken pipe\" |")
+          or die "Cannot run FASTQC: $!\n";
+        while(<CMD>){
+          # Can safely ignore blank lines and SIGPIPE warnings
+          next if /^\s*$/ or /Broken pipe/; 
+          print STDERR $_; # forward any other errors
+        }
+        close(CMD);
+	system("perl -i.bak -pe \"s/>FastQC Report</>FastQC Report<div><a href='..\\/index.html'>Back to Table of Contents<\\\/a><\\\/div></;s/Images|Icons/./\" $outDir/$barcode/fastqc_report.html");
+        system("unzip -o -d $outDir/$barcode -qq -j $outDir/$barcode/$barcode\_fastqc.zip $barcode\_fastqc/Icons/*.png");
+	# append to html file
+	print OUTFILE "<div><a href='$barcode/fastqc_report.html'>$barcode</a></div>";
+}
+
+
+print OUTFILE "</body></html>";
+close(OUTFILE);
+system("cp $htmlFile $outDir/index.html");
+system("cd $local_dir/$expandedRunName; rm $archiveFile; zip -r $archiveFile RunInfo.xml RunParameters.xml InterOp -q");
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rgFastQC.py	Wed Mar 25 13:43:47 2015 -0600
@@ -0,0 +1,220 @@
+"""
+# May 2013 ross added check for bogus gz extension - fastqc gets confused
+# added sanitizer for user supplied name
+# removed shell and make cl a sequence for Popen call
+# ross lazarus August 10 2012 in response to anon insecurity report
+wrapper for fastqc
+
+called as
+  <command interpreter="python">
+    rgFastqc.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix"
+  </command>
+
+
+
+Current release seems overly intolerant of sam/bam header strangeness
+Author notified...
+
+
+"""
+import re
+import os
+import sys
+import subprocess
+import optparse
+import shutil
+import tempfile
+import zipfile
+import gzip
+import magic
+
+
+def getFileString(fpath, outpath):
+    """
+    format a nice file size string
+    """
+    size = ''
+    fp = os.path.join(outpath, fpath)
+    s = '? ?'
+    if os.path.isfile(fp):
+        n = float(os.path.getsize(fp))
+        if n > 2**20:
+            size = ' (%1.1f MB)' % (n/2**20)
+        elif n > 2**10:
+            size = ' (%1.1f KB)' % (n/2**10)
+        elif n > 0:
+            size = ' (%d B)' % (int(n))
+        s = '%s %s' % (fpath, size) 
+    return s
+
+
+class FastQC():
+    """wrapper
+    """
+    
+    
+    def __init__(self,opts=None):
+        assert opts <> None
+        self.opts = opts
+        
+        
+    def run_fastqc(self):
+        """
+        In batch mode fastqc behaves not very nicely - will write to a new folder in
+        the same place as the infile called [infilebasename]_fastqc
+    rlazarus@omics:/data/galaxy/test$ ls FC041_1_sequence_fastqc
+    duplication_levels.png  fastqc_icon.png          per_base_n_content.png         per_sequence_gc_content.png       summary.txt
+    error.png               fastqc_report.html       per_base_quality.png           per_sequence_quality.png          tick.png
+    fastqc_data.txt         per_base_gc_content.png  per_base_sequence_content.png  sequence_length_distribution.png  warning.png
+
+        """
+        serr = ''
+        dummy,tlog = tempfile.mkstemp(prefix='rgFastQC',suffix=".log",dir=self.opts.outputdir)
+        sout = open(tlog, 'w')
+        fastq = os.path.basename(self.opts.input)
+        cl = [self.opts.executable,'--outdir=%s' % self.opts.outputdir]
+        if self.opts.informat in ['sam','bam']:
+            cl.append('--format=%s' % self.opts.informat)
+        if self.opts.contaminants <> None :
+            cl.append('--contaminants=%s' % self.opts.contaminants)
+        # patch suggested by bwlang https://bitbucket.org/galaxy/galaxy-central/pull-request/30
+        # use a symlink in a temporary directory so that the FastQC report reflects the history input file name
+        infname = self.opts.inputfilename
+        linf = infname.lower()
+        trimext = False
+        # decompression at upload currently does NOT remove this now bogus ending - fastqc will barf
+        # patched may 29 2013 until this is fixed properly
+        input_magic = magic.from_file(self.opts.input)
+        if ( linf.endswith('.gz') or linf.endswith('.gzip') or 'gzip' in input_magic): 
+            f = gzip.open(self.opts.input)
+            try:
+                testrow = f.readline()
+            except:
+                trimext = True
+            f.close()
+        elif linf.endswith('bz2'):
+            f = bz2.open(self.opts.input,'rb')
+            try:
+                f.readline()
+            except:
+                trimext = True
+            f.close()
+        elif linf.endswith('.zip'):
+            if not zipfile.is_zipfile(self.opts.input):
+                trimext = True
+        if trimext:
+            infname = os.path.splitext(infname)[0]
+        fastqinfilename = re.sub(ur'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname))
+        link_name = os.path.join(self.opts.outputdir, fastqinfilename)
+        os.symlink(self.opts.input, link_name)
+        cl.append(link_name)        
+        if('gzip' in input_magic):
+            sout.write('# File magic = %s\n' % input_magic)
+        sout.write('# FastQC cl = %s\n' % ' '.join(cl))
+        sout.flush()
+        p = subprocess.Popen(cl, shell=False, stderr=sout, stdout=sout, cwd=self.opts.outputdir)
+        retval = p.wait()
+        sout.close()
+        runlog = open(tlog,'r').readlines()
+        os.unlink(link_name)
+        flist = os.listdir(self.opts.outputdir) # fastqc plays games with its output directory name. eesh
+        odpath = None
+        for f in flist:
+            d = os.path.join(self.opts.outputdir,f)
+            if os.path.isdir(d):
+                if d.endswith('_fastqc'):
+                    odpath = d 
+        hpath = None
+        if odpath <> None:
+            try: 
+                hpath = os.path.join(odpath,'fastqc_report.html')
+                rep = open(hpath,'r').readlines() # for our new html file but we need to insert our stuff after the <body> tag
+            except:
+                pass
+        if hpath == None:
+            serr = '\n'.join(runlog)       
+            res =  ['## odpath=%s: No output found in %s. Output for the run was:<pre>\n' % (odpath,hpath),]
+            res += runlog
+            res += ['</pre>\n',
+                   'Please read the above for clues<br/>\n',
+                   'If you selected a sam/bam format file, it might not have headers or they may not start with @HD?<br/>\n',
+                   'It is also possible that the log shows that fastqc is not installed?<br/>\n',
+                   'If that is the case, please tell the relevant Galaxy administrator that it can be snarfed from<br/>\n',
+                   'http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/<br/>\n',]
+            return res,1,serr
+        self.fix_fastqcimages(odpath)
+        flist = os.listdir(self.opts.outputdir) # these have now been fixed
+        excludefiles = ['tick.png','warning.png','fastqc_icon.png','error.png']
+        flist = [x for x in flist if not x in excludefiles]
+        for i in range(len(rep)): # need to fix links to Icons and Image subdirectories in lastest fastqc code - ugh
+            rep[i] = rep[i].replace('Icons/','')
+            rep[i] = rep[i].replace('Images/','')
+
+        html = self.fix_fastqc(rep,flist,runlog)
+        return html,retval,serr
+        
+
+        
+    def fix_fastqc(self,rep=[],flist=[],runlog=[]):
+        """ add some of our stuff to the html
+        """
+        bodyindex = len(rep) -1  # hope they don't change this
+        footrow = bodyindex - 1 
+        footer = rep[footrow]
+        rep = rep[:footrow] + rep[footrow+1:]
+        res = ['<div class="module"><h2>Files created by FastQC</h2><table cellspacing="2" cellpadding="2">\n']
+        flist.sort()
+        for i,f in enumerate(flist):
+            if not(os.path.isdir(f)):
+                fn = os.path.split(f)[-1]
+                res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,getFileString(fn, self.opts.outputdir)))
+        res.append('</table>\n') 
+        res.append('<a href="http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/">FastQC documentation and full attribution is here</a><br/><hr/>\n')
+        res.append('FastQC was run by Galaxy using the rgenetics rgFastQC wrapper - see http://bitbucket.org/rgenetics for details and licensing\n</div>')
+        res.append(footer)
+        fixed = rep[:bodyindex] + res + rep[bodyindex:]
+        return fixed # with our additions
+
+
+    def fix_fastqcimages(self,odpath):
+        """ Galaxy wants everything in the same files_dir
+        """
+        icpath = os.path.join(odpath,'Icons')
+        impath = os.path.join(odpath,'Images')
+        for adir in [icpath,impath,odpath]:
+            if os.path.exists(adir):
+                flist = os.listdir(adir) # get all files created
+                for f in flist:
+                    if not os.path.isdir(os.path.join(adir,f)):
+                        sauce = os.path.join(adir,f)
+                        dest = os.path.join(self.opts.outputdir,f)
+                        shutil.move(sauce,dest)
+                os.rmdir(adir)
+
+    
+
+if __name__ == '__main__':
+    op = optparse.OptionParser()
+    op.add_option('-i', '--input', default=None)
+    op.add_option('-j', '--inputfilename', default=None)    
+    op.add_option('-o', '--htmloutput', default=None)
+    op.add_option('-d', '--outputdir', default="/tmp/shortread")
+    op.add_option('-f', '--informat', default='fastq')
+    op.add_option('-n', '--namejob', default='rgFastQC')
+    op.add_option('-c', '--contaminants', default=None)
+    op.add_option('-e', '--executable', default='fastqc')
+    opts, args = op.parse_args()
+    assert opts.input <> None
+    assert os.path.isfile(opts.executable),'##rgFastQC.py error - cannot find executable %s' % opts.executable
+    if not os.path.exists(opts.outputdir): 
+        os.makedirs(opts.outputdir)
+    f = FastQC(opts)
+    html,retval,serr = f.run_fastqc()
+    f = open(opts.htmloutput, 'w')
+    f.write(''.join(html))
+    f.close()
+    if retval <> 0:
+        print >> sys.stderr, serr # indicate failure
+         
+    
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/transfer_convert_nextseq.loc	Wed Mar 25 13:43:47 2015 -0600
@@ -0,0 +1,1 @@
+local_dir=/export/achri_data/Runs/VetMedNextSeq
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/transfer_convert_nextseq.xml	Wed Mar 25 13:43:47 2015 -0600
@@ -0,0 +1,25 @@
+<?xml version="1.0"?>
+
+<tool id='nextseq_transfer_convert' name="Transfer and convert NextSeq run" version="1.0.0" hidden="false">
+	
+	<description>
+		Transfer and convert a NextSeq run
+	</description>
+
+	<command interpreter="perl">
+		copyNextSeq.pl $__tool_data_path__ --run $run_name --user $__user_email__ --toolDir $__tool_data_path__ --out $html_file.files_path --html $html_file --samplesheet $samplesheet --archive $sav_archive
+	</command>
+	
+	<inputs>
+		<param name="run_name" type="text" size="40" label="Run Name or Illumina Cartridge Identifier" 
+		       help="A text string unique enough to unambiguously identify the autogenerated run name from the sequencer, such as the cartridge barcode (e.g. H35VJBGXX)"/>
+		<param name="samplesheet" type="data" format="tabular" label="Illumina Sample Sheet"
+		       help="A comma separated spreadsheet with Header, Reads, Settings and Data sections defining the experimental setup, as generated using BaseSpace or Illumina Experiment Manager (IEM) v1.8.2, or later"/> 
+	</inputs>
+
+	<outputs>
+	       <data format="html" name="html_file" label="FAST QC Web report" />
+	       <data format="zip" name="sav_archive" label="SAV files archive" />
+	</outputs>
+	<help>This tools downloads a NextSeq run from the local machine and performs base call to fastq conversion, followed by QC reports on sample data, including a zipped archive with all the files required to view metrics in Illumina's SAV software.</help>
+</tool>