Mercurial > repos > yusuf > transfer_convert_nextseq
changeset 0:d4ac6e05c96c default tip
initial commit
| author | Yusuf Ali <ali@yusuf.email> | 
|---|---|
| date | Wed, 25 Mar 2015 13:43:47 -0600 | 
| parents | |
| children | |
| files | copyNextSeq.pl rgFastQC.py tool-data/transfer_convert_nextseq.loc transfer_convert_nextseq.xml | 
| diffstat | 4 files changed, 434 insertions(+), 0 deletions(-) [+] | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/copyNextSeq.pl Wed Mar 25 13:43:47 2015 -0600 @@ -0,0 +1,188 @@ +#!/usr/bin/perl +use strict; +use warnings; +use Getopt::Long; +use File::Find; +use File::Basename; +use vars qw(@fastq_files); + +my $dirname = dirname(__FILE__); +my $pythonScript = "$dirname/rgFastQC.py"; +my $tool_dir = shift @ARGV; +my $pythonJars = "$tool_dir/shared/jars/FastQC/fastqc"; + +# Site config +my $num_threads = 32; +my $fastq_sample_size = 400000; +my $seq_host = "10.81.192.138"; +my $seq_username = "nextseq-user"; +my $seq_dir = "Desktop/Share"; + +#get localdir +if(not -e "$tool_dir/transfer_convert_nextseq.loc"){ + system("cat $dirname/tool-data/transfer_convert_nextseq.loc > $tool_dir/transfer_convert_nextseq.loc"); +} +open FILE, "$tool_dir/transfer_convert_nextseq.loc" or die "Could not open configuration file: $!\n"; +my @keys = split("=",<FILE>); +(my $local_dir = $keys[$#keys]) =~s/\s+//g; +close FILE; + +# store arguments into variables +my $runName; +my $sampleSheet; +my $user; +my $accessFile; +my $outDir; +my $htmlFile; +my $archiveFile; + +GetOptions ("run=s" => \$runName, + "samplesheet=s" => \$sampleSheet, + "user=s" => \$user, + "toolDir=s" => \$accessFile, + "out=s" => \$outDir, + "html=s" => \$htmlFile, + "archive=s" => \$archiveFile); + +if(not defined $runName or not defined $sampleSheet or not defined $user or not defined $accessFile or not defined $outDir or not defined $htmlFile){ + die "Usage: $0 -run <unique_suffix> -samplesheet <illumina.csv> -user <user\@domain in nextseq_access.conf> -toolDir <galaxy tool conf dir> ", + "-out <output dir for FASTQC report> -html <FASTQC report file name> -archive <SAV files.zip>\n"; +} + +$accessFile = "$accessFile/nextseq_access.conf"; + +# create access file if not already there +my $command = `touch $accessFile`; +open my $handle, '<', "$accessFile"; +chomp(my @allowed_users = <$handle>); + +$runName = quotemeta($runName); + +my ($out_file, $out_path, $out_ext ) = fileparse( $htmlFile, "\.[^.]*" ); + +# check to make sure $user is allowed to run script +if (! ($user ~~ @allowed_users) ){ + die "Please ask the administrator to add $user to $accessFile in order to gain access to this tool\n"; +} + +# First, sanity check the sample file +open(CSV, $sampleSheet) + or die "Cannot open $sampleSheet for reading: $!\n"; +undef $/; # slurp up whole file at once by undefining record separator +my @CSV = split /\r?\n/, <CSV>; # allow different endings +close(CSV); +$/="\n"; # restore normal per-line reading +my ($has_header, $has_reads, $has_data); +for(@CSV){ + if(/^\[Header\]/){ + $has_header = 1; + } + elsif(/^\[Reads\]/){ + $has_reads = 1; + } + elsif(/^\[Data\]/){ + $has_data = 1; + } +} +if(not defined $has_header){ + die "Header section is missing in sample sheet, please fix and resubmit this job\n"; +} +if(not defined $has_reads){ + die "Reads section is missing in sample sheet, please fix and resubmit this job\n"; +} +if(not defined $has_data){ + die "Data section is missing in sample sheet, please fix and resubmit this job\n"; +} + +# Expand the catridge ID into the full run name on the remote host, input should look something like "H35VJBGXX" +open(SSH, "ssh $seq_username\@$seq_host ls -1 $seq_dir |") + or die "Could not run ssh login to $seq_host: $!\n"; +my @matchOptions; +my @mismatchOptions; +while(<SSH>){ + chomp; + if(/$runName/o){ + push @matchOptions, $_; + } + else{ + push @mismatchOptions, $_; + } +} +close(SSH); +if(not @matchOptions){ + if(not @mismatchOptions){ + die "There was no data found on the rempote server at all, please ask the administrator to ", + "check this tool's setup (currently checking $seq_username\@$seq_host:$seq_dir)\n"; + } + # Keep only the ones not already uploaded as options + @mismatchOptions = grep {not -e "$local_dir/$_"} @mismatchOptions; + die "No run folder matching $runName was found at $seq_username\@$seq_host:$seq_dir, please try with another ", + "run name. The following would work currently: ", join(", ", @mismatchOptions), "\n"; +} +elsif(@matchOptions > 1){ + die "Ambiguous run name specification, please revise \"$runName\" to distinguish between existing datasets: ", + join(", ", @matchOptions), "\n"; +} +my $expandedRunName = $matchOptions[0]; # unambiguous, so proceed + +# if sample already exits as a folder, die +if(-e "$local_dir/$expandedRunName"){ +# die "Run $expandedRunName already exists on galaxy ($local_dir/$expandedRunName), cannot copy over\n"; +} +# if not, copy to folder +else{ +# system("scp -r $seq_username\@$seq_host\:$seq_dir/$expandedRunName $local_dir") >> 8 and die "Failed to copy from $seq_host to galaxy: scp exit status $?\n"; +} + +# Put the sample sheet where it needs to be with the transfered data +open(CSV, ">$local_dir/$expandedRunName/SampleSheet.csv") + or die "Cannot open $local_dir/$expandedRunName/SampleSheet.csv for writing: $!\nThe data files have been transfered, but no BCL to FASTQ conversion has taken place.\n"; +print CSV join("\n", @CSV); +close(CSV); + +# convert bcl files to fastq +#system("cd $local_dir/$expandedRunName; /export/common/programs/bcl2fastq/bin/bcl2fastq -r $num_threads -d $num_threads -p $num_threads -w $num_threads")>>8 +# and die "BCL to FASTQ conversion had non-zero exit status ($?). The BCL files were transfered, but FASTQ files were not generated.\n"; + +# Find the FASTQ files generated +find(sub{push @fastq_files, $File::Find::name if /\.fastq.gz$/}, "$local_dir/$expandedRunName"); + +# Run FASTQC on sample of data from each lane/barcode +# open output file and write html +open(OUTFILE, ">$htmlFile") + or die "Cannot open $htmlFile for writing: $!\n"; +print OUTFILE "<html><body><h1>Barcodes</h1>"; +system("mkdir -p $outDir"); + +# generate html plot using python tool +$SIG{'PIPE'} = 'IGNORE'; +my $cwd = dirname(__FILE__); +foreach my $file (@fastq_files){ + my ($barcode, $path, $ext ) = fileparse( $file, "\.fastq\.gz" ); + my $cmd = "gzip -cd $file | head -n $fastq_sample_size | python $pythonScript -i /dev/stdin " +. "-d $outDir/$barcode/. " +. "-o fastqc_report.html " +. "-n \"FASTQC $barcode\" " +. "-f \"FASTQ\" " +. "-j \"$barcode$ext\" " +. "-e $pythonJars"; + # Assumes the bash shell is being used + open(CMD, "trap '' SIGPIPE; $cmd 2| grep -v \"Broken pipe\" |") + or die "Cannot run FASTQC: $!\n"; + while(<CMD>){ + # Can safely ignore blank lines and SIGPIPE warnings + next if /^\s*$/ or /Broken pipe/; + print STDERR $_; # forward any other errors + } + close(CMD); + system("perl -i.bak -pe \"s/>FastQC Report</>FastQC Report<div><a href='..\\/index.html'>Back to Table of Contents<\\\/a><\\\/div></;s/Images|Icons/./\" $outDir/$barcode/fastqc_report.html"); + system("unzip -o -d $outDir/$barcode -qq -j $outDir/$barcode/$barcode\_fastqc.zip $barcode\_fastqc/Icons/*.png"); + # append to html file + print OUTFILE "<div><a href='$barcode/fastqc_report.html'>$barcode</a></div>"; +} + + +print OUTFILE "</body></html>"; +close(OUTFILE); +system("cp $htmlFile $outDir/index.html"); +system("cd $local_dir/$expandedRunName; rm $archiveFile; zip -r $archiveFile RunInfo.xml RunParameters.xml InterOp -q");
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rgFastQC.py Wed Mar 25 13:43:47 2015 -0600 @@ -0,0 +1,220 @@ +""" +# May 2013 ross added check for bogus gz extension - fastqc gets confused +# added sanitizer for user supplied name +# removed shell and make cl a sequence for Popen call +# ross lazarus August 10 2012 in response to anon insecurity report +wrapper for fastqc + +called as + <command interpreter="python"> + rgFastqc.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix" + </command> + + + +Current release seems overly intolerant of sam/bam header strangeness +Author notified... + + +""" +import re +import os +import sys +import subprocess +import optparse +import shutil +import tempfile +import zipfile +import gzip +import magic + + +def getFileString(fpath, outpath): + """ + format a nice file size string + """ + size = '' + fp = os.path.join(outpath, fpath) + s = '? ?' + if os.path.isfile(fp): + n = float(os.path.getsize(fp)) + if n > 2**20: + size = ' (%1.1f MB)' % (n/2**20) + elif n > 2**10: + size = ' (%1.1f KB)' % (n/2**10) + elif n > 0: + size = ' (%d B)' % (int(n)) + s = '%s %s' % (fpath, size) + return s + + +class FastQC(): + """wrapper + """ + + + def __init__(self,opts=None): + assert opts <> None + self.opts = opts + + + def run_fastqc(self): + """ + In batch mode fastqc behaves not very nicely - will write to a new folder in + the same place as the infile called [infilebasename]_fastqc + rlazarus@omics:/data/galaxy/test$ ls FC041_1_sequence_fastqc + duplication_levels.png fastqc_icon.png per_base_n_content.png per_sequence_gc_content.png summary.txt + error.png fastqc_report.html per_base_quality.png per_sequence_quality.png tick.png + fastqc_data.txt per_base_gc_content.png per_base_sequence_content.png sequence_length_distribution.png warning.png + + """ + serr = '' + dummy,tlog = tempfile.mkstemp(prefix='rgFastQC',suffix=".log",dir=self.opts.outputdir) + sout = open(tlog, 'w') + fastq = os.path.basename(self.opts.input) + cl = [self.opts.executable,'--outdir=%s' % self.opts.outputdir] + if self.opts.informat in ['sam','bam']: + cl.append('--format=%s' % self.opts.informat) + if self.opts.contaminants <> None : + cl.append('--contaminants=%s' % self.opts.contaminants) + # patch suggested by bwlang https://bitbucket.org/galaxy/galaxy-central/pull-request/30 + # use a symlink in a temporary directory so that the FastQC report reflects the history input file name + infname = self.opts.inputfilename + linf = infname.lower() + trimext = False + # decompression at upload currently does NOT remove this now bogus ending - fastqc will barf + # patched may 29 2013 until this is fixed properly + input_magic = magic.from_file(self.opts.input) + if ( linf.endswith('.gz') or linf.endswith('.gzip') or 'gzip' in input_magic): + f = gzip.open(self.opts.input) + try: + testrow = f.readline() + except: + trimext = True + f.close() + elif linf.endswith('bz2'): + f = bz2.open(self.opts.input,'rb') + try: + f.readline() + except: + trimext = True + f.close() + elif linf.endswith('.zip'): + if not zipfile.is_zipfile(self.opts.input): + trimext = True + if trimext: + infname = os.path.splitext(infname)[0] + fastqinfilename = re.sub(ur'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname)) + link_name = os.path.join(self.opts.outputdir, fastqinfilename) + os.symlink(self.opts.input, link_name) + cl.append(link_name) + if('gzip' in input_magic): + sout.write('# File magic = %s\n' % input_magic) + sout.write('# FastQC cl = %s\n' % ' '.join(cl)) + sout.flush() + p = subprocess.Popen(cl, shell=False, stderr=sout, stdout=sout, cwd=self.opts.outputdir) + retval = p.wait() + sout.close() + runlog = open(tlog,'r').readlines() + os.unlink(link_name) + flist = os.listdir(self.opts.outputdir) # fastqc plays games with its output directory name. eesh + odpath = None + for f in flist: + d = os.path.join(self.opts.outputdir,f) + if os.path.isdir(d): + if d.endswith('_fastqc'): + odpath = d + hpath = None + if odpath <> None: + try: + hpath = os.path.join(odpath,'fastqc_report.html') + rep = open(hpath,'r').readlines() # for our new html file but we need to insert our stuff after the <body> tag + except: + pass + if hpath == None: + serr = '\n'.join(runlog) + res = ['## odpath=%s: No output found in %s. Output for the run was:<pre>\n' % (odpath,hpath),] + res += runlog + res += ['</pre>\n', + 'Please read the above for clues<br/>\n', + 'If you selected a sam/bam format file, it might not have headers or they may not start with @HD?<br/>\n', + 'It is also possible that the log shows that fastqc is not installed?<br/>\n', + 'If that is the case, please tell the relevant Galaxy administrator that it can be snarfed from<br/>\n', + 'http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/<br/>\n',] + return res,1,serr + self.fix_fastqcimages(odpath) + flist = os.listdir(self.opts.outputdir) # these have now been fixed + excludefiles = ['tick.png','warning.png','fastqc_icon.png','error.png'] + flist = [x for x in flist if not x in excludefiles] + for i in range(len(rep)): # need to fix links to Icons and Image subdirectories in lastest fastqc code - ugh + rep[i] = rep[i].replace('Icons/','') + rep[i] = rep[i].replace('Images/','') + + html = self.fix_fastqc(rep,flist,runlog) + return html,retval,serr + + + + def fix_fastqc(self,rep=[],flist=[],runlog=[]): + """ add some of our stuff to the html + """ + bodyindex = len(rep) -1 # hope they don't change this + footrow = bodyindex - 1 + footer = rep[footrow] + rep = rep[:footrow] + rep[footrow+1:] + res = ['<div class="module"><h2>Files created by FastQC</h2><table cellspacing="2" cellpadding="2">\n'] + flist.sort() + for i,f in enumerate(flist): + if not(os.path.isdir(f)): + fn = os.path.split(f)[-1] + res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,getFileString(fn, self.opts.outputdir))) + res.append('</table>\n') + res.append('<a href="http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/">FastQC documentation and full attribution is here</a><br/><hr/>\n') + res.append('FastQC was run by Galaxy using the rgenetics rgFastQC wrapper - see http://bitbucket.org/rgenetics for details and licensing\n</div>') + res.append(footer) + fixed = rep[:bodyindex] + res + rep[bodyindex:] + return fixed # with our additions + + + def fix_fastqcimages(self,odpath): + """ Galaxy wants everything in the same files_dir + """ + icpath = os.path.join(odpath,'Icons') + impath = os.path.join(odpath,'Images') + for adir in [icpath,impath,odpath]: + if os.path.exists(adir): + flist = os.listdir(adir) # get all files created + for f in flist: + if not os.path.isdir(os.path.join(adir,f)): + sauce = os.path.join(adir,f) + dest = os.path.join(self.opts.outputdir,f) + shutil.move(sauce,dest) + os.rmdir(adir) + + + +if __name__ == '__main__': + op = optparse.OptionParser() + op.add_option('-i', '--input', default=None) + op.add_option('-j', '--inputfilename', default=None) + op.add_option('-o', '--htmloutput', default=None) + op.add_option('-d', '--outputdir', default="/tmp/shortread") + op.add_option('-f', '--informat', default='fastq') + op.add_option('-n', '--namejob', default='rgFastQC') + op.add_option('-c', '--contaminants', default=None) + op.add_option('-e', '--executable', default='fastqc') + opts, args = op.parse_args() + assert opts.input <> None + assert os.path.isfile(opts.executable),'##rgFastQC.py error - cannot find executable %s' % opts.executable + if not os.path.exists(opts.outputdir): + os.makedirs(opts.outputdir) + f = FastQC(opts) + html,retval,serr = f.run_fastqc() + f = open(opts.htmloutput, 'w') + f.write(''.join(html)) + f.close() + if retval <> 0: + print >> sys.stderr, serr # indicate failure + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/transfer_convert_nextseq.loc Wed Mar 25 13:43:47 2015 -0600 @@ -0,0 +1,1 @@ +local_dir=/export/achri_data/Runs/VetMedNextSeq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/transfer_convert_nextseq.xml Wed Mar 25 13:43:47 2015 -0600 @@ -0,0 +1,25 @@ +<?xml version="1.0"?> + +<tool id='nextseq_transfer_convert' name="Transfer and convert NextSeq run" version="1.0.0" hidden="false"> + + <description> + Transfer and convert a NextSeq run + </description> + + <command interpreter="perl"> + copyNextSeq.pl $__tool_data_path__ --run $run_name --user $__user_email__ --toolDir $__tool_data_path__ --out $html_file.files_path --html $html_file --samplesheet $samplesheet --archive $sav_archive + </command> + + <inputs> + <param name="run_name" type="text" size="40" label="Run Name or Illumina Cartridge Identifier" + help="A text string unique enough to unambiguously identify the autogenerated run name from the sequencer, such as the cartridge barcode (e.g. H35VJBGXX)"/> + <param name="samplesheet" type="data" format="tabular" label="Illumina Sample Sheet" + help="A comma separated spreadsheet with Header, Reads, Settings and Data sections defining the experimental setup, as generated using BaseSpace or Illumina Experiment Manager (IEM) v1.8.2, or later"/> + </inputs> + + <outputs> + <data format="html" name="html_file" label="FAST QC Web report" /> + <data format="zip" name="sav_archive" label="SAV files archive" /> + </outputs> + <help>This tools downloads a NextSeq run from the local machine and performs base call to fastq conversion, followed by QC reports on sample data, including a zipped archive with all the files required to view metrics in Illumina's SAV software.</help> +</tool>
