changeset 0:c1eeccec29d1

Uploaded
author ryo-tas
date Mon, 13 Feb 2012 02:34:11 -0500
parents
children 845215ebd132
files macs14/macs14_wrapper.py macs14/macs14_wrapper.xml
diffstat 2 files changed, 370 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macs14/macs14_wrapper.py	Mon Feb 13 02:34:11 2012 -0500
@@ -0,0 +1,132 @@
+import sys, subprocess, tempfile, shutil, glob, os, os.path, gzip
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "simplejson" )
+import simplejson
+
+CHUNK_SIZE = 1024
+
+def gunzip_cat_glob_path( glob_path, target_filename, delete = False ):
+    out = open( target_filename, 'wb' )
+    for filename in glob.glob( glob_path ):
+        fh = gzip.open( filename, 'rb' )
+        while True:
+            data = fh.read( CHUNK_SIZE )
+            if data:
+                out.write( data )
+            else:
+                break
+        fh.close()
+        if delete:
+            os.unlink( filename )
+    out.close()
+
+def xls_to_interval( xls_file, interval_file, header = None ):
+    out = open( interval_file, 'wb' )
+    if header:
+        out.write( '#%s\n' % header )
+    wrote_header = False
+    #From macs readme: Coordinates in XLS is 1-based which is different with BED format.
+    for line in open( xls_file ):
+        #keep all existing comment lines
+        if line.startswith( '#' ):
+            out.write( line )
+        elif not wrote_header:
+            out.write( '#%s' % line )
+            wrote_header = True
+        else:
+            fields = line.split( '\t' )
+            if len( fields ) > 1:
+                fields[1] = str( int( fields[1] ) - 1 )
+            out.write( '\t'.join( fields ) )
+    out.close()
+
+def main():
+    options = simplejson.load( open( sys.argv[1] ) )
+    output_bed = sys.argv[2]
+    output_extra_html = sys.argv[3]
+    output_extra_path = sys.argv[4]
+    
+    experiment_name = '_'.join( options['experiment_name'].split() ) #save experiment name here, it will be used by macs for filenames (gzip of wig files will fail with spaces - macs doesn't properly escape them)..need to replace all whitespace, split makes this easier
+    cmdline = "macs14 -t %s" % ",".join( options['input_chipseq'] )
+    if options['input_control']:
+        cmdline = "%s -c %s" % ( cmdline, ",".join( options['input_control'] ) )
+    cmdline = "%s --format='%s' --name='%s' --gsize='%s' --tsize='%s' --bw='%s' --pvalue='%s' --mfold='%s' %s --slocal='%s' --llocal='%s'  %s" % ( cmdline, options['format'], experiment_name, options['gsize'], options['tsize'], options['bw'], options['pvalue'], options['mfold'], options['nolambda'], options['slocal'], options['llocal'], options['futurefdr'] )
+    if 'wig' in options:
+        cmdline = "%s --wig --space='%s'" % ( cmdline, options['wig']['space'] )
+    if 'nomodel' in options:
+        cmdline = "%s --nomodel --shiftsize='%s'" % ( cmdline, options['nomodel'] )
+    if 'diag' in options:
+        cmdline = "%s --diag --fe-min='%s' --fe-max='%s' --fe-step='%s'" % ( cmdline, options['diag']['fe-min'], options['diag']['fe-max'], options['diag']['fe-step'] )
+   
+    cmdline_macs14 = cmdline 
+    tmp_dir = tempfile.mkdtemp() #macs makes very messy output, need to contain it into a temp dir, then provide to user
+    stderr_name = tempfile.NamedTemporaryFile().name # redirect stderr here, macs provides lots of info via stderr, make it into a report
+    proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir, stderr=open( stderr_name, 'wb' ) )
+    proc.wait()
+    #We don't want to set tool run to error state if only warnings or info, e.g. mfold could be decreased to improve model, but let user view macs log
+    #Do not terminate if error code, allow dataset (e.g. log) creation and cleanup
+    if proc.returncode:
+        stderr_f = open( stderr_name )
+        while True:
+            chunk = stderr_f.read( CHUNK_SIZE )
+            if not chunk:
+                stderr_f.close()
+                break
+            sys.stderr.write( chunk )
+    
+    #run R to create pdf from model script
+    if os.path.exists( os.path.join( tmp_dir, "%s_model.r" % experiment_name ) ):
+        cmdline = 'R --vanilla --slave < "%s_model.r" > "%s_model.r.log"' % ( experiment_name, experiment_name )
+        proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir )
+        proc.wait()
+    
+    
+    #move bed out to proper output file
+    created_bed_name =  os.path.join( tmp_dir, "%s_peaks.bed" % experiment_name )
+    if os.path.exists( created_bed_name ):
+        shutil.move( created_bed_name, output_bed )
+    
+    #parse xls files to interval files as needed
+    if options['xls_to_interval']:
+        create_peak_xls_file = os.path.join( tmp_dir, '%s_peaks.xls' % experiment_name )
+        if os.path.exists( create_peak_xls_file ):
+            xls_to_interval( create_peak_xls_file, options['xls_to_interval']['peaks_file'], header = 'peaks file' )
+        create_peak_xls_file = os.path.join( tmp_dir, '%s_negative_peaks.xls' % experiment_name )
+        if os.path.exists( create_peak_xls_file ):
+            xls_to_interval( create_peak_xls_file, options['xls_to_interval']['negative_peaks_file'], header = 'negative peaks file' )
+    
+    #merge and move wig files as needed, delete gz'd files and remove emptied dirs
+    if 'wig' in options:
+        wig_base_dir = os.path.join( tmp_dir, "%s_MACS_wiggle" % experiment_name )
+        if os.path.exists( wig_base_dir ):
+            #treatment
+            treatment_dir = os.path.join( wig_base_dir, "treat" )
+            if os.path.exists( treatment_dir ):
+                gunzip_cat_glob_path( os.path.join( treatment_dir, "*.wig.gz" ), options['wig']['output_treatment_file'], delete = True )
+                os.rmdir( treatment_dir )
+                #control
+                if options['input_control']:
+                    control_dir = os.path.join( wig_base_dir, "control" )
+                    if os.path.exists( control_dir ):
+                        gunzip_cat_glob_path( os.path.join( control_dir, "*.wig.gz" ), options['wig']['output_control_file'], delete = True )
+                        os.rmdir( control_dir )
+            os.rmdir( wig_base_dir )
+    
+    #move all remaining files to extra files path of html file output to allow user download
+    out_html = open( output_extra_html, 'wb' )
+    out_html.write( '<html><head><title>Additional output created by MACS (%s)</title></head><body><h3>Additional Files:</h3><p><ul>\n' % experiment_name )
+    os.mkdir( output_extra_path )
+    for filename in sorted( os.listdir( tmp_dir ) ):
+        shutil.move( os.path.join( tmp_dir, filename ), os.path.join( output_extra_path, filename ) )
+        out_html.write( '<li><a href="%s">%s</a></li>\n' % ( filename, filename ) )
+    out_html.write( '</ul></p>\n' )
+    out_html.write( '<h3>CMD Executed:</h3>\n<p><pre>%s</pre></p>\n' % ( cmdline_macs14 ) )
+    out_html.write( '<h3>Messages from MACS:</h3>\n<p><pre>%s</pre></p>\n' % open( stderr_name, 'rb' ).read() )
+    out_html.write( '</body></html>\n' )
+    out_html.close()
+    
+    os.unlink( stderr_name )
+    os.rmdir( tmp_dir )
+
+if __name__ == "__main__": main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macs14/macs14_wrapper.xml	Mon Feb 13 02:34:11 2012 -0500
@@ -0,0 +1,238 @@
+<tool id="peakcalling_macs14" name="MACS14" version="1.4.1">
+  <description>Model-based Analysis of ChIP-Seq (1.4.1)</description>
+  <command interpreter="python">macs14_wrapper.py $options_file $output_bed_file $output_extra_files $output_extra_files.files_path</command>
+  <requirements>
+    <requirement type="python-module">macs14</requirement>
+    <requirement type="package">macs14</requirement>
+  </requirements>
+  <inputs>
+    <param name="experiment_name" type="text" value="MACS14 in Galaxy" size="50" label="Experiment Name"/>
+    <conditional name="input_type">
+      <param name="input_type_selector" type="select" label="Paired End Sequencing">
+        <option value="paired_end">Paired End (requires elandmulti format)</option>
+        <option value="single_end" selected="true">Single End</option>
+      </param>
+      <when value="paired_end">
+        <param name="input_chipseq_file1" type="data" format="elandmulti" label="ChIP-Seq Tag File 1" />
+        <param name="input_chipseq_file2" type="data" format="elandmulti" label="ChIP-Seq Tag File 2" />
+        <param name="input_control_file1" type="data" format="elandmulti" optional="True" label="ChIP-Seq Control File 1" />
+        <param name="input_control_file2" type="data" format="elandmulti" optional="True" label="ChIP-Seq Control File 2" />
+        <param name="petdist" type="integer" label="Best distance between Pair-End Tags" value="200"/>
+      </when>
+      <when value="single_end">
+        <param name="input_chipseq_file1" type="data" format="bed,sam,bam,eland,elandmulti" label="ChIP-Seq Tag File" />
+        <param name="input_control_file1" type="data" format="bed,sam,bam,eland,elandmulti" optional="True" label="ChIP-Seq Control File" />
+      </when>
+    </conditional>
+    <param name="gsize" type="float" label="Effective genome size" value="2.7e+9" help="default: 2.7e+9"/>
+    <param name="tsize" type="integer" label="Tag size" value="25"/>
+    <param name="bw" type="integer" label="Band width" value="300"/>
+    <param name="pvalue" type="float" label="Pvalue cutoff for peak detection" value="1e-5" help="default: 1e-5"/>
+    <param name="mfold" type="text" label="Select the regions within MFOLD range of high-confidence enrichment ratio against background to build model. The regions must be lower than upper limit, and higher than the lower limit" value="10,30"/>
+    <param name="xls_to_interval" label="Parse xls files into into distinct interval files" type="boolean" truevalue="create" falsevalue="do_not_create" checked="False"/>
+    <conditional name="wig_type">
+      <param name="wig_type_selector" type="select" label="Save shifted raw tag count at every bp into a wiggle file">
+        <option value="wig">Save</option>
+        <option value="no_wig" selected="true">Do not create wig file (faster)</option>
+      </param>
+      <when value="wig">
+        <!--
+        <param name="wigextend" type="integer" label="Extend tag from its middle point to a wigextend size fragment." value="-1" help="Use value less than 0 for default (modeled d)"/>
+        -->
+        <param name="space" type="integer" label="Resolution for saving wiggle files" value="10"/>
+      </when>
+      <when value="no_wig">
+        <!-- do nothing here -->
+      </when>
+
+    </conditional>
+    <param name="nolambda" label="Use fixed background lambda as local lambda for every peak region" type="boolean" truevalue="--nolambda" falsevalue="" checked="False" help="up to 9X more time consuming"/>
+    <!--
+    <param name="lambdaset" type="text" label="3 levels of regions around the peak region to calculate the maximum lambda as local lambda" value="1000,5000,10000" size="50"/>
+    -->
+    <param name="slocal" type="text" label="The small nearby region in basepairs to calculate dynamic lambda. This is used to capture the bias near the peak summit region. Invalid if there is no control data" value="1000" size="10"/>
+    <param name="llocal" type="text" label="The large nearby region in basepairs to calculate dynamic lambda. This is used to capture the surround bias" value="10000" size="10"/>
+    <conditional name="nomodel_type">
+      <param name="nomodel_type_selector" type="select" label="Build Model">
+        <option value="nomodel">Do not build the shifting model</option>
+        <option value="create_model" selected="true">Build the shifting model</option>
+      </param>
+      <when value="nomodel">
+        <param name="shiftsize" type="integer" label="Arbitrary shift size in bp" value="100"/>
+      </when>
+      <when value="create_model">
+        <!-- do nothing here -->
+      </when>
+    </conditional>
+    <conditional name="diag_type">
+      <param name="diag_type_selector" type="select" label="Diagnosis report" help="up to 9X more time consuming">
+        <option value="diag">Produce a diagnosis report</option>
+        <option value="no_diag" selected="true">Do not produce report (faster)</option>
+      </param>
+      <when value="diag">
+        <param name="fe-min" type="integer" label="Min fold enrichment to consider" value="0"/>
+        <param name="fe-max" type="integer" label="Max fold enrichment to consider" value="32"/>
+        <param name="fe-step" type="integer" label="Fold enrichment step" value="20"/>
+      </when>
+      <when value="no_diag">
+        <!-- do nothing here -->
+      </when>
+    </conditional>
+    <param name="futurefdr" label="Perform the new peak detection method (futurefdr)" type="boolean" truevalue="--futurefdr" falsevalue="" checked="False" help="The default method only consider the peak location, 1k, 5k, and 10k regions in the control data; whereas the new future method also consider the 5k, 10k regions in treatment data to calculate local bias."/>
+  </inputs>
+  <outputs>
+    <data name="output_bed_file" format="bed" label="${tool.name} on ${on_string} (peaks: bed)"/>
+    <data name="output_xls_to_interval_peaks_file" format="interval" label="${tool.name} on ${on_string} (peaks: interval)">
+      <filter>xls_to_interval is True</filter>
+    </data>
+    <data name="output_xls_to_interval_negative_peaks_file" format="interval" label="${tool.name} on ${on_string} (negative peaks: interval)">
+      <filter>xls_to_interval is True</filter>
+      <filter>input_type['input_control_file1'] is not None</filter>
+    </data>
+    <data name="output_treatment_wig_file" format="wig" label="${tool.name} on ${on_string} (treatment: wig)">
+      <filter>wig_type['wig_type_selector']=='wig'</filter>
+    </data>
+    <data name="output_control_wig_file" format="wig" label="${tool.name} on ${on_string} (control: wig)">
+      <filter>wig_type['wig_type_selector'] == 'wig'</filter>
+      <filter>input_type['input_control_file1'] is not None</filter>
+    </data>
+    <data name="output_extra_files" format="html" label="${tool.name} on ${on_string} (html report)"/>
+  </outputs>
+  <configfiles>
+    <configfile name="options_file">&lt;%
+import simplejson
+%&gt;
+#set $__options = { 'experiment_name':str( $experiment_name ), 'gsize':int( float( str( $gsize ) ) ), 'tsize':str( $tsize ), 'bw':str( $bw ), 'pvalue':str( $pvalue ), 'mfold':str( $mfold ), 'nolambda':str( $nolambda ), 'slocal': str( $slocal ), 'llocal': str( $llocal ), 'futurefdr':str( $futurefdr ) }
+#if str( $xls_to_interval ) == 'create':
+#set $__options['xls_to_interval'] = { 'peaks_file': str( $output_xls_to_interval_peaks_file ), 'negative_peaks_file': str( $output_xls_to_interval_negative_peaks_file ) }
+#else:
+#set $__options['xls_to_interval'] = False
+#end if
+##treatment/tag input files and format
+#set $__options['input_chipseq'] = [ str( $input_type['input_chipseq_file1'] ) ]
+#if  $input_type['input_type_selector'] == 'paired_end':
+#set $_hole = __options['input_chipseq'].append( str( $input_type['input_chipseq_file2'] ) )
+#set $__options['format'] = 'ELANDMULTIPET'
+#else:
+#set $__options['format'] = $input_type['input_chipseq_file1'].extension.upper()
+#end if
+##control/input files
+#set $__options['input_control'] = []
+#if str( $input_type['input_control_file1'] ) != 'None':
+#set $_hole = __options['input_control'].append( str( $input_type['input_control_file1'] ) )
+#end if
+#if $input_type['input_type_selector'] == 'paired_end' and str( $input_type['input_control_file2'] ) != 'None':
+#set $_hole = __options['input_control'].append( str( $input_type['input_control_file2'] ) )
+#end if
+##wig options
+#if $wig_type['wig_type_selector'] == 'wig':
+#set $__options['wig'] = {}
+#set $__options['wig']['space'] = str( $wig_type['space'] )
+#set  $__options['wig']['output_treatment_file'] = str( $output_treatment_wig_file )
+#if $input_type['input_control_file1'] is not None:
+#set  $__options['wig']['output_control_file'] = str( $output_control_wig_file )
+#end if
+#end if
+##model options
+#if $nomodel_type['nomodel_type_selector'] == 'nomodel':
+#set $__options['nomodel'] = str( $nomodel_type['shiftsize'] )
+#end if
+##diag options
+#if $diag_type['diag_type_selector'] == 'diag':
+#set $__options['diag'] = { 'fe-min':str( $diag_type['fe-min'] ), 'fe-max':str( $diag_type['fe-max'] ), 'fe-step':str( $diag_type['fe-step'] ) }
+#end if
+${ simplejson.dumps( __options ) }
+    </configfile>
+  </configfiles>
+  <tests>
+    <test>
+      <param name="input_type_selector" value="single_end" />
+      <param name="input_chipseq_file1" value="chipseq_enriched.bed.gz" ftype="bed" />
+      <param name="input_control_file1" value="chipseq_input.bed.gz" ftype="bed" />
+      <param name="experiment_name" value="Galaxy Test Run" />
+      <param name="tsize" value="36" />
+      <param name="mfold" value="10,30" />
+      <param name="gsize" value="2.7e+9" />
+      <param name="bw" value="300" />
+      <param name="pvalue" value="1e-5" />
+      <param name="xls_to_interval" />
+      <param name="wig_type_selector" value="no_wig" />
+      <param name="nolambda"/>
+      <param name="slocal" value="1000"/>
+      <param name="llocal" value="10000"/>
+      <param name="nomodel_type_selector" value="create_model" />
+      <param name="diag_type_selector" value="no_diag" />
+      <param name="futurefdr"/>
+      <output name="output_bed_file" file="peakcalling_macs/macs_test_1_out.bed" />
+      <output name="output_html_file" file="peakcalling_macs/macs_test_1_out.html" compare="re_match" >
+        <extra_files type="file" name="Galaxy_Test_Run_model.pdf" value="peakcalling_macs/test2/Galaxy_Test_Run_model.pdf" compare="re_match"/>
+        <extra_files type="file" name="Galaxy_Test_Run_model.r" value="peakcalling_macs/test2/Galaxy_Test_Run_model.r" compare="re_match"/>
+        <extra_files type="file" name="Galaxy_Test_Run_model.r.log" value="peakcalling_macs/test2/Galaxy_Test_Run_model.r.log"/>
+        <extra_files type="file" name="Galaxy_Test_Run_negative_peaks.xls" value="peakcalling_macs/test2/Galaxy_Test_Run_negative_peaks.xls" compare="re_match"/>
+        <extra_files type="file" name="Galaxy_Test_Run_peaks.xls" value="peakcalling_macs/test2/Galaxy_Test_Run_peaks.xls" compare="re_match"/>
+      </output>
+    </test>
+    <test>
+      <param name="input_type_selector" value="single_end" />
+      <param name="input_chipseq_file1" value="chipseq_enriched.bed.gz" ftype="bed" />
+      <param name="input_control_file1" value="chipseq_input.bed.gz" ftype="bed" />
+      <param name="experiment_name" value="Galaxy Test Run" />
+      <param name="tsize" value="36" />
+      <param name="mfold" value="10,30" />
+      <param name="gsize" value="2.7e+9" />
+      <param name="bw" value="300" />
+      <param name="pvalue" value="1e-5" />
+      <param name="xls_to_interval" value="true" />
+      <param name="wig_type_selector" value="no_wig" />
+      <param name="nolambda"/>
+      <param name="slocal" value="1000"/>
+      <param name="llocal" value="10000"/>
+      <param name="nomodel_type_selector" value="create_model" />
+      <param name="diag_type_selector" value="no_diag" />
+      <param name="futurefdr"/>
+      <output name="output_bed_file" file="peakcalling_macs/macs_test_1_out.bed" />
+      <output name="output_xls_to_interval_peaks_file" file="peakcalling_macs/macs_test_2_peaks_out.interval" lines_diff="4" />
+      <output name="output_xls_to_interval_negative_peaks_file" file="peakcalling_macs/macs_test_2_neg_peaks_out.interval" />
+      <output name="output_html_file" file="peakcalling_macs/macs_test_1_out.html" compare="re_match" >
+        <extra_files type="directory" value="peakcalling_macs/test2/" compare="re_match"/>
+      </output>
+    </test>
+    <!-- <test>
+      <param name="input_type_selector" value="single_end" />
+      <param name="input_chipseq_file1" value="chipseq_enriched.bed.gz" ftype="bed" />
+      <param name="input_control_file1" value="chipseq_input.bed.gz" ftype="bed" />
+      <param name="experiment_name" value="Galaxy Test Run" />
+      <param name="tsize" value="36" />
+      <param name="mfold" value="13" />
+      <param name="gsize" value="2.7e+9" />
+      <param name="bw" value="300" />
+      <param name="pvalue" value="1e-5" />
+      <param name="xls_to_interval" value="true" />
+      <param name="wig_type_selector" value="wig" />
+      <param name="wigextend" value="-1" />
+      <param name="space" value="10" />
+      <param name="nolambda"/>
+      <param name="lambdaset" value="1000,5000,10000"/>
+      <param name="nomodel_type_selector" value="create_model" />
+      <param name="diag_type_selector" value="no_diag" />
+      <param name="futurefdr"/>
+      <output name="output_bed_file" file="peakcalling_macs/macs_test_1_out.bed" />
+      <output name="output_xls_to_interval_peaks_file" file="peakcalling_macs/macs_test_2_peaks_out.interval" lines_diff="4" />
+      <output name="output_xls_to_interval_negative_peaks_file" file="macs_test_2_neg_peaks_out.interval" />
+      <output name="output_treatment_wig_file" file="peakcalling_macs/macs_test_3_treatment_out.wig" />
+      <output name="output_control_wig_file" file="peakcalling_macs/macs_test_3_control_out.wig" />
+      <output name="output_html_file" file="peakcalling_macs/macs_test_3_out.html" compare="re_match" >
+        <extra_files type="directory" value="peakcalling_macs/test2/" compare="re_match"/>
+      </output>
+    </test> -->
+  </tests>
+  <help>
+**What it does**
+
+This tool allows ChIP-seq peak calling using MACS.
+
+Depending upon selected options, 2 to 6 history items will be created; the first output will be a standard BED file and the last will be an HTML report containing links to download additional files generated by MACS. Up to two each of wig and interval files can be optionally created; the interval files are parsed from the xls output.
+
+View the original MACS documentation: http://liulab.dfci.harvard.edu/MACS/00README.html.
+  </help>
+</tool>