changeset 0:bc23f6946bb8 default tip

Alternative barcode splitters that move selected results to the user's history.
author Jim Johnson <jj@umn.edu>
date Tue, 19 Jul 2011 13:03:32 -0500
parents
children
files fastx_barcode_splitter.xml fastx_barcode_splitter_galaxy_wrapper.py fastx_barcode_splitter_single.xml fastx_barcode_splitter_single_galaxy_wrapper.py
diffstat 4 files changed, 328 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastx_barcode_splitter.xml	Tue Jul 19 13:03:32 2011 -0500
@@ -0,0 +1,88 @@
+<tool id="cshl_fastx_barcode_splitter" name="Barcode Splitter" force_history_refresh="True">
+	<description></description>
+	<requirements><requirement type="package">fastx_toolkit</requirement></requirements>
+	<command interpreter="python">fastx_barcode_splitter_galaxy_wrapper.py 
+           ## params for galaxy wrapper
+           $output 
+           "$output.id" 
+           "$input.ext" 
+           "$__new_file_path__" 
+           --barcodes='$barcodes'
+           $BARCODE $input "$input.name" "$output.extra_files_path" 
+           ## params for fastx_barcode_splitter
+            --mismatches $mismatches --partial $partial $EOL 
+        </command>
+
+	<inputs>
+		<param format="txt" name="BARCODE" type="data" label="Barcodes to use" />
+                <param format="fasta,fastqsanger,fastqsolexa,fastqillumina" name="input" type="data" label="Library to split" />
+
+		<param name="EOL" type="select" label="Barcodes found at">
+			<option value="--bol">Start of sequence (5' end)</option>
+			<option value="--eol">End of sequence (3' end)</option>
+		</param>
+
+		<param name="mismatches" type="integer" size="3" value="2" label="Number of allowed mismatches" />
+		
+		<param name="partial" type="integer" size="3" value="0" label="Number of allowed barcodes nucleotide deletions" />
+
+                <param name="barcodes" type="select" multiple="true" label="Select barcodes to add as new datasets to history">
+		    <options from_dataset="BARCODE">
+    			<column name="name" index="0"/>
+    			<column name="value" index="0"/>
+                        <filter type="unique_value" name="unq_bc" column="0" />
+                        <filter type="add_value" name="unmatched" value="unmatched"/>
+		    </options>
+                </param>
+	</inputs>
+
+	<outputs>
+		<data format="html" name="output" />
+	</outputs>
+	
+	<tests>
+		<test>
+			<!-- Split a FASTQ file -->
+			<param name="BARCODE" value="fastx_barcode_splitter1.txt" />
+			<param name="input" value="fastx_barcode_splitter1.fastq" ftype="fastqsolexa" />
+			<param name="EOL" value="Start of sequence (5' end)" />
+			<param name="mismatches" value="2" />
+			<param name="partial" value="0" />
+			<output name="output" file="fastx_barcode_splitter1.out" />
+		</test>
+	</tests>
+
+<help>
+
+**What it does**
+
+This tool splits a Solexa library (FASTQ file) or a regular FASTA file into several files, using barcodes as the split criteria.
+
+--------
+
+**Barcode file Format**
+
+Barcode files are simple text files.
+Each line should contain an identifier (descriptive name for the barcode), and the barcode itself (A/C/G/T), separated by a TAB character.
+Example::
+
+    #This line is a comment (starts with a 'number' sign)
+    BC1	GATCT
+    BC2	ATCGT
+    BC3	GTGAT
+    BC4	TGTCT
+    
+For each barcode, a new FASTQ file will be created (with the barcode's identifier as part of the file name).
+Sequences matching the barcode will be stored in the appropriate file.
+
+One additional FASTQ file will be created (the 'unmatched' file), where sequences not matching any barcode will be stored.
+
+The output of this tool is an HTML file, displaying the split counts and the file locations.
+
+**Output Example**
+
+.. image:: ./static/fastx_icons/barcode_splitter_output_example.png
+
+</help>
+</tool>
+<!-- FASTX-barcode-splitter is part of the FASTX-toolkit, by A.Gordon (gordon@cshl.edu) -->
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastx_barcode_splitter_galaxy_wrapper.py	Tue Jul 19 13:03:32 2011 -0500
@@ -0,0 +1,91 @@
+import sys, os, os.path, tempfile, shutil, re, shlex, subprocess
+
+def stop_err( msg ):  # Report a fatal error: write msg plus a newline to stderr, then terminate the script.
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()  # NOTE(review): called without an argument, so the process exits with status 0 despite the error
+
+# Transform the fastx_barcode_splitter result (results_path) into an HTML table (html_path); optionally echo the first two columns of each line to stdout.
+def results_to_html(results_path,html_path,basepath,print_stdout ):
+    pat = '%s[/]?([^\t]*)' % basepath  # basepath, an optional '/', then the rest of the cell up to the next tab; NOTE(review): basepath is not regex-escaped
+    rep = '<a href=\"\\1\">\\1</a>'  # turn the captured remainder into a link whose target and text are that remainder
+    txt = open(results_path,'r')
+    html = open(html_path,'w')
+    html.write('<html><body><table border=1>\n')
+    try:
+        for line in txt:  # one table row per result line, one cell per tab-separated field
+            html.write('<tr><td>%s</td></tr>' % re.sub('\t','</td><td>',re.sub(pat,rep,line)))
+            if print_stdout:
+                print >> sys.stdout, '\t'.join(line.split('\t')[:2])
+    except Exception, e:  # a failure is only printed to stdout; the table is still closed below
+        print(str(e))
+        pass
+    html.write('</table></body></html>\n')
+    html.close()
+    txt.close()
+
+def __main__():
+    """
+           ##params for galaxy wrapper
+           $output
+           "$output.id"
+           "$input.ext"
+           "$__new_file_path__"
+           --barcodes='$barcodes'
+           $BARCODE $input "$input.name" "$output.extra_files_path"
+           ## params for fastx_barcode_splitter
+            --mismatches $mismatches --partial $partial $EOL
+    """
+    output = sys.argv[1]
+    output_id = sys.argv[2]
+    file_ext = sys.argv[3]
+    new_file_path = sys.argv[4]
+    select_barcodes = sys.argv[5].replace('--barcodes=','')
+    barcodes = sys.argv[6]
+    fastx = sys.argv[7]
+    fastx_name = sys.argv[8]
+    extra_files_path = sys.argv[9]
+    script_args = ' '.join(sys.argv[10:])
+    #Sanitize library name, make sure we can create a file with this name
+    lib_name = re.sub('\W','_',re.sub('\.\W*$','',fastx_name))+'_'
+    prefix = os.path.join(extra_files_path,lib_name)
+    # Check that input datasets exist
+    if not os.path.isfile(fastx):
+        stop_err('Error: Input file (%s) not found!' % fastx)
+    if not os.path.isfile(barcodes):
+        stop_err('Error: barcode file (%s) not found!' % barcodes)
+    try:
+        # Check that extra_files_path exists
+        if not os.path.isdir(extra_files_path):
+            os.makedirs(extra_files_path)
+        cmd_line = 'zcat -f %s | fastx_barcode_splitter.pl --bcfile %s --prefix %s --suffix %s %s' %(fastx,barcodes,prefix,'.'+file_ext,script_args)
+        # print >> sys.stderr, cmd_line 
+        # Create file to collect results written to stdout
+        tmp_dir = tempfile.mkdtemp()
+        result_path = tempfile.NamedTemporaryFile(dir=tmp_dir, prefix='results_', suffix='.out' ).name
+        result_file = open( result_path, 'wb' )
+        proc = subprocess.Popen( args=cmd_line, shell=True, cwd=tmp_dir, stderr=subprocess.PIPE,stdout=result_file.fileno() )
+        returncode = proc.wait()
+        result_file.close()
+        stderr = proc.stderr.read()
+        if returncode != 0:
+            raise Exception, stderr
+        # copy results to ouptut
+        results_to_html(result_path,output,extra_files_path,True)
+        # make new datasets for selected barcodes
+        if select_barcodes != None and len(select_barcodes) > 0:
+            flist = os.listdir(extra_files_path)
+            for barcode in select_barcodes.split(','):
+                for fname in flist:
+                    if fname.find('_'+barcode+'.'+file_ext) >= 0:
+                        fpath = os.path.join(extra_files_path,fname)
+                        # filename pattern required by galaxy 
+                        fn = "%s_%s_%s_%s_%s" % ( 'primary', output_id, barcode, 'visible', file_ext )
+                        npath = os.path.join(new_file_path,fn)
+                        try:
+                            os.link(fpath, npath)
+                        except:
+                            shutil.copy2(fpath, npath)
+    except Exception, e:
+        raise Exception, 'Exception caught attempting conversion: ' + str( e )
+
+if __name__ == "__main__": __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastx_barcode_splitter_single.xml	Tue Jul 19 13:03:32 2011 -0500
@@ -0,0 +1,63 @@
+<tool id="cshl_fastx_barcode_splitter_single" name="Barcode Splitter (Single)">
+	<description></description>
+	<requirements><requirement type="package">fastx_toolkit</requirement></requirements>
+	<command interpreter="python">fastx_barcode_splitter_single_galaxy_wrapper.py 
+           $matched_output 
+           $unmatched_output
+           "$input.ext" 
+           --barcodes='$barcode'
+           $input "$input.name"
+            --mismatches $mismatches --partial $partial $EOL 
+        </command>
+
+	<inputs>
+		<!-- <param format="txt" name="BARCODE" type="data" label="Barcodes to use" /> -->
+    <param format="fasta,fastqsanger,fastqsolexa,fastqillumina" name="input" type="data" label="Library to split" />
+
+		<param name="EOL" type="select" label="Barcodes found at">
+			<option value="--bol">Start of sequence (5' end)</option>
+			<option value="--eol">End of sequence (3' end)</option>
+		</param>
+
+		<param name="mismatches" type="integer" size="3" value="2" label="Number of allowed mismatches" />
+		
+		<param name="partial" type="integer" size="3" value="0" label="Number of allowed barcodes nucleotide deletions" />
+
+    <param name="barcode" type="text" label="Barcode to extract" />
+
+    <!-- 
+                <param name="barcodes" type="select" multiple="true" label="Select barcodes to add as new datasets to history">
+		    <options from_dataset="BARCODE">
+    			<column name="name" index="0"/>
+    			<column name="value" index="0"/>
+                        <filter type="unique_value" name="unq_bc" column="0" />
+                        <filter type="add_value" name="unmatched" value="unmatched"/>
+		    </options>
+                </param>
+    -->
+	</inputs>
+
+	<outputs>
+		<data format_source="input" name="matched_output" label="Barcode Splitter on ${input.name} (Matching sequences)" />
+    <data format_source="input" name="unmatched_output" label="Barcode Splitter on ${input.name} (Unmatched sequences)" />
+	</outputs>
+	
+	<tests>
+	</tests>
+
+<help>
+
+**What it does**
+
+This tool splits a Solexa library (FASTQ file) or a regular FASTA file into two files using a barcode as the split criteria.
+
+--------
+
+A new FASTQ file will be created (with the barcode's identifier as part of the file name).
+Sequences matching the barcode will be stored in the appropriate file.
+
+An additional FASTQ file will be created (the 'unmatched' file), where sequences not matching this barcode will be stored.
+
+</help>
+</tool>
+<!-- FASTX-barcode-splitter is part of the FASTX-toolkit, by A.Gordon (gordon@cshl.edu) -->
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastx_barcode_splitter_single_galaxy_wrapper.py	Tue Jul 19 13:03:32 2011 -0500
@@ -0,0 +1,86 @@
+import sys, os, os.path, tempfile, shutil, re, shlex, subprocess
+
+def stop_err( msg ):  # Report a fatal error: write msg plus a newline to stderr, then terminate the script.
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()  # NOTE(review): called without an argument, so the process exits with status 0 despite the error
+
+# Transform the fastx_barcode_splitter result (results_path) into an HTML table (html_path); not called by this wrapper's __main__.
+def results_to_html(results_path,html_path,basepath,print_stdout ):
+    pat = '%s[/]?([^\t]*)' % basepath  # basepath, an optional '/', then the rest of the cell up to the next tab; NOTE(review): basepath is not regex-escaped
+    rep = '<a href=\"\\1\">\\1</a>'  # turn the captured remainder into a link whose target and text are that remainder
+    txt = open(results_path,'r')
+    html = open(html_path,'w')
+    html.write('<html><body><table border=1>\n')
+    try:
+        for line in txt:  # one table row per result line, one cell per tab-separated field
+            html.write('<tr><td>%s</td></tr>' % re.sub('\t','</td><td>',re.sub(pat,rep,line)))
+            if print_stdout:
+                print >> sys.stdout, '\t'.join(line.split('\t')[:2])
+    except Exception, e:  # a failure is only printed to stdout; the table is still closed below
+        print(str(e))
+        pass
+    html.write('</table></body></html>\n')
+    html.close()
+    txt.close()
+
+def __main__():
+    """
+           ##params for galaxy wrapper
+           $output
+           "$output.id"
+           "$input.ext"
+           "$__new_file_path__"
+           --barcodes='$barcodes'
+           $BARCODE $input "$input.name" "$output.extra_files_path"
+           ## params for fastx_barcode_splitter
+            --mismatches $mismatches --partial $partial $EOL
+    """
+    
+    output = sys.argv[1]
+    output_unmatched =  sys.argv[2]
+    file_ext = sys.argv[3]
+    select_barcode = sys.argv[4].replace('--barcodes=','')
+    barcodes = os.path.abspath("barcodes")
+    with open(barcodes, 'w') as f:
+        f.write("barcode\t%s\n" % (select_barcode))
+    
+    #barcodes = sys.argv[6]
+    fastx = sys.argv[5]
+    fastx_name = sys.argv[6]
+    #extra_files_path = sys.argv[9]
+    script_args = ' '.join(sys.argv[7:])
+    #Sanitize library name, make sure we can create a file with this name
+    lib_name = re.sub('\W','_',re.sub('\.\W*$','',fastx_name))+'_'
+    # Check that input datasets exist
+    if not os.path.isfile(fastx):
+        stop_err('Error: Input file (%s) not found!' % fastx)
+    try:
+        prefix = lib_name
+        cmd_line = 'zcat -f %s | fastx_barcode_splitter.pl --bcfile %s --prefix %s --suffix %s %s' %(fastx,barcodes,prefix,'.'+file_ext,script_args)
+        # print >> sys.stderr, cmd_line 
+        # Create file to collect results written to stdout
+        tmp_dir = tempfile.mkdtemp()
+        result_path = tempfile.NamedTemporaryFile(dir=tmp_dir, prefix='results_', suffix='.out' ).name
+        result_file = open( result_path, 'wb' )
+        proc = subprocess.Popen( args=cmd_line, shell=True, cwd=tmp_dir, stderr=subprocess.PIPE,stdout=result_file.fileno() )
+        returncode = proc.wait()
+        result_file.close()
+        stderr = proc.stderr.read()
+        if returncode != 0:
+            raise Exception, stderr
+        # copy results to ouptut
+        #results_to_html(result_path,output,extra_files_path,True)
+        # make new datasets for selected barcodes
+        flist = os.listdir(tmp_dir)
+        for fname in flist:
+            if fname.find('_'+barcode+'.'+file_ext) >= 0:
+                fpath = os.path.join(tmp_dir,fname)
+                shutil.copy2(fpath, output)
+        for fname in flist:
+            if fname.find('_unmatched.' + file_ext) > 0:
+                fpath = os.path.join(tmp_dir, fname)
+                shutil.copy2(fpath, output_unmatched)
+    except Exception, e:
+        raise Exception, 'Exception caught attempting conversion: ' + str( e )
+
+if __name__ == "__main__": __main__()