# HG changeset patch # User crs4 # Date 1382119751 14400 # Node ID cd6cc6d767081b4cf7e225d10c7956cb1fe26e88 # Parent 60609a9cef3bec007d88cec09cf5b2e8b7a7c9dd Simplify passing repeated params to Python script. Add more info to help sections. diff -r 60609a9cef3b -r cd6cc6d76708 edena_ass_wrapper.py --- a/edena_ass_wrapper.py Mon Sep 09 05:44:31 2013 -0400 +++ b/edena_ass_wrapper.py Fri Oct 18 14:09:11 2013 -0400 @@ -32,37 +32,16 @@ (options, args) = parser.parse_args() if len(args) > 0: parser.error('Wrong number of arguments') - + # build Edena (assembling) command to be executed ovl_input = '-e %s' % (options.ovl_input) - if options.overlapCutoff is not None: - overlapCutoff = '-m %d' % (options.overlapCutoff) - else: - overlapCutoff = '' - if options.cc: - cc = '-cc yes' - else: - cc = '-cc no' - if options.discardNonUsable: - discardNonUsable = '-discardNonUsable yes' - else: - discardNonUsable = '-discardNonUsable no' - if options.minContigSize is not None: - minContigSize = '-c %d' % (options.minContigSize) - else: - minContigSize = '' - if options.minCoverage is not None: - minCoverage = '-minCoverage %s' % (options.minCoverage) - else: - minCoverage = '' - if options.trim is not None: - trim = '-trim %d' % (options.trim) - else: - trim = '' - if options.peHorizon is not None: - peHorizon = '-peHorizon %d' % (options.peHorizon) - else: - peHorizon = '' + overlapCutoff = '-m %d' % (options.overlapCutoff) if options.overlapCutoff is not None else '' + cc = '-cc yes' if options.cc else '-cc no' + discardNonUsable = '-discardNonUsable yes' if options.discardNonUsable else '-discardNonUsable no' + minContigSize = '-c %d' % (options.minContigSize) if options.minContigSize is not None else '' + minCoverage = '-minCoverage %s' % (options.minCoverage) if options.minCoverage is not None else '' + trim = '-trim %d' % (options.trim) if options.trim is not None else '' + peHorizon = '-peHorizon %d' % (options.peHorizon) if options.peHorizon is not None else '' covStats = 
options.covStats out_contigs_cov = options.out_contigs_cov out_contigs_fasta = options.out_contigs_fasta @@ -71,20 +50,16 @@ out_nodesInfo = options.out_nodesInfo out_nodesPosition = options.out_nodesPosition logfile = options.logfile - + # Build Edena (assembling) command - cmd1 = '%s %s %s %s %s %s %s %s' % (ovl_input, overlapCutoff, cc, discardNonUsable, minContigSize, minCoverage, trim, peHorizon) - cmd2 = 'edena %s' % ( cmd1 ) - print '\nEdena (assembling) command to be executed: \n %s' % ( cmd2 ) - + cmd = 'edena %s %s %s %s %s %s %s %s' % (ovl_input, overlapCutoff, cc, discardNonUsable, minContigSize, minCoverage, trim, peHorizon) + print '\nEdena (assembling) command to be executed:\n %s' % (cmd) + # Execution of Edena print 'Executing Edena (assembling)...' - if logfile: - log = open(logfile, 'w') - else: - log = sys.stdout + log = open(logfile, 'w') if logfile else sys.stdout try: - subprocess.check_call(cmd2, stdout=log, stderr=subprocess.STDOUT, shell=True) # need to redirect stderr because edena writes some logging info there (e.g. "Condensing overlaps graph...") + subprocess.check_call(cmd, stdout=log, stderr=subprocess.STDOUT, shell=True) # need to redirect stderr because edena writes some logging info there (e.g. "Condensing overlaps graph...") finally: if log != sys.stdout: log.close() diff -r 60609a9cef3b -r cd6cc6d76708 edena_ass_wrapper.xml --- a/edena_ass_wrapper.xml Mon Sep 09 05:44:31 2013 -0400 +++ b/edena_ass_wrapper.xml Fri Oct 18 14:09:11 2013 -0400 @@ -66,7 +66,11 @@ **What it does** -The key parameter for this mode is the overlaps size cutoff (option –m). By default it is set to half of the reads length, which is quite conservative. If your sequencing project is well covered (>50-100x) you may try increasing a bit this value. The minCoverage is an important parameter which is automatically determined. You may check this value in the program output and possibly override it. 
+Edena is an overlaps graph based short reads assembler and is suited to Illumina GA reads. An assembly with Edena is a two-step process: overlapping and assembling. + +In the assembling step, the overlapping file (produced in the previous step) is provided to the program, as well as some assembly parameters. A set of contigs in FASTA format is outputted. The purpose of having a two-step process is that the overlapping file is computed only once and can then be used to produce assemblies with different parameters. + +The key parameter for this step is the overlaps size cutoff (option -m). By default it is set to half of the reads length, which is quite conservative. If your sequencing project is well covered (>50-100x) you may try increasing this value a bit. The minCoverage is an important parameter which is automatically determined. You may check this value in the program output and possibly override it. **License and citation** diff -r 60609a9cef3b -r cd6cc6d76708 edena_ovl_wrapper.py --- a/edena_ovl_wrapper.py Mon Sep 09 05:44:31 2013 -0400 +++ b/edena_ovl_wrapper.py Fri Oct 18 14:09:11 2013 -0400 @@ -13,11 +13,11 @@ # load arguments print 'Parsing Edena (overlapping) input options...' 
parser = optparse.OptionParser() - parser.add_option('--unpaired_input', dest='unpaired_input', help='') - parser.add_option('--dr_pair_1', dest='dr_pair_1', help='') - parser.add_option('--dr_pair_2', dest='dr_pair_2', help='') - parser.add_option('--rd_pair_1', dest='rd_pair_1', help='') - parser.add_option('--rd_pair_2', dest='rd_pair_2', help='') + parser.add_option('--unpaired_input', action='append', dest='unpaired_input', help='') + parser.add_option('--dr_pair_1', action='append', dest='dr_pair_1', help='') + parser.add_option('--dr_pair_2', action='append', dest='dr_pair_2', help='') + parser.add_option('--rd_pair_1', action='append', dest='rd_pair_1', help='') + parser.add_option('--rd_pair_2', action='append', dest='rd_pair_2', help='') parser.add_option('--nThreads', dest='nThreads', type='int', help='') parser.add_option('--minOlap', dest='minOlap', type='int', help='') parser.add_option('--readsTruncation', dest='readsTruncation', type='int', help='') @@ -26,71 +26,54 @@ (options, args) = parser.parse_args() if len(args) > 0: parser.error('Wrong number of arguments') - + # build Edena (overlapping) command to be executed # unpaired input(s) if options.unpaired_input: - unpaired_inputs = options.unpaired_input.split('+')[0:-1] unpaired_input = '-r' - for item in unpaired_inputs: + for item in options.unpaired_input: unpaired_input += ' %s' % (item) else: unpaired_input = '' # direct-reverse paired-end files if options.dr_pair_1 and options.dr_pair_2: - dr_pairs_1 = options.dr_pair_1.split('+')[0:-1] - dr_pairs_2 = options.dr_pair_2.split('+')[0:-1] dr_pairs = '-DRpairs' - for i in xrange(len(dr_pairs_1)): - dr_pairs += ' %s %s' % (dr_pairs_1[i], dr_pairs_2[i]) + for i in range(len(options.dr_pair_1)): + dr_pairs += ' %s %s' % (options.dr_pair_1[i], options.dr_pair_2[i]) else: dr_pairs = '' # reverse-direct paired-end files if options.rd_pair_1 and options.rd_pair_2: - rd_pairs_1 = options.rd_pair_1.split('+')[0:-1] - rd_pairs_2 = 
options.rd_pair_2.split('+')[0:-1] rd_pairs = '-RDpairs' - for i in xrange(len(rd_pairs_1)): - rd_pairs += ' %s %s' % (rd_pairs_1[i], rd_pairs_2[i]) + for i in range(len(options.rd_pair_1)): + rd_pairs += ' %s %s' % (options.rd_pair_1[i], options.rd_pair_2[i]) else: rd_pairs = '' # nThreads - if options.nThreads is not None: - nThreads = '-nThreads %d' % (options.nThreads) - else: - nThreads = '' + nThreads = '-nThreads %d' % (options.nThreads) if options.nThreads is not None else '' # minimum overlap - if options.minOlap is not None: - minOlap = '-M %d' % (options.minOlap) - else: - minOlap = '' + minOlap = '-M %d' % (options.minOlap) if options.minOlap is not None else '' # 3' end reads truncation - if options.readsTruncation is not None: - readsTruncation = '-t %d' % (options.readsTruncation) - else: - readsTruncation = '' + readsTruncation = '-t %d' % (options.readsTruncation) if options.readsTruncation is not None else '' # output file(s) output = options.output logfile = options.logfile - + # Build Edena (overlapping) command cmd = 'edena %s %s %s %s %s %s -p galaxy_output' % (unpaired_input, dr_pairs, rd_pairs, nThreads, minOlap, readsTruncation) - print '\nEdena (overlapping) command to be executed: \n %s' % ( cmd ) - + print '\nEdena (overlapping) command to be executed:\n %s' % (cmd) + # Execution of Edena print 'Executing Edena (overlapping)...' - if logfile: - log = open(logfile, 'w') - else: - log = sys.stdout + log = open(logfile, 'w') if logfile else sys.stdout try: subprocess.check_call(cmd, stdout=log, stderr=subprocess.STDOUT, shell=True) # need to redirect stderr because edena writes some logging info there (e.g. "Computing overlaps >=30...") finally: if log != sys.stdout: log.close() print 'Edena (overlapping) executed!' 
- - shutil.move( "galaxy_output.ovl", output) + + shutil.move('galaxy_output.ovl', output) if __name__ == "__main__": diff -r 60609a9cef3b -r cd6cc6d76708 edena_ovl_wrapper.xml --- a/edena_ovl_wrapper.xml Mon Sep 09 05:44:31 2013 -0400 +++ b/edena_ovl_wrapper.xml Fri Oct 18 14:09:11 2013 -0400 @@ -8,44 +8,18 @@ edena_ovl_wrapper.py \${EDENA_SITE_OPTIONS:---nThreads 2} #if $input_selection.input == "unpaired_file" - #for $i, $unpaired_file in enumerate( $input_selection.unpaired_input ): - #if $i == 0 - #echo "--unpaired_input=" - #end if - #echo $unpaired_file.unpaired_file - #echo '+' + #for $ui in $input_selection.unpaired_input + --unpaired_input=${ui.unpaired_file} #end for #elif $input_selection.input == "dr_pairs" - #for $i, $dr_pair_1 in enumerate( $input_selection.dr_pairs_input ): - #if $i == 0 - #echo "--dr_pair_1=" - #end if - #echo $dr_pair_1.dr_pair_1 - #echo '+' - #end for - #echo ' ' - #for $i, $dr_pair_2 in enumerate( $input_selection.dr_pairs_input ): - #if $i == 0 - #echo "--dr_pair_2=" - #end if - #echo $dr_pair_2.dr_pair_2 - #echo '+' + #for $dpi in $input_selection.dr_pairs_input + --dr_pair_1=${dpi.dr_pair_1} + --dr_pair_2=${dpi.dr_pair_2} #end for #elif $input_selection.input == "rd_pairs" - #for $i, $rd_pair_1 in enumerate( $input_selection.rd_pairs_input ): - #if $i == 0 - #echo "--rd_pair_1=" - #end if - #echo $rd_pair_1.rd_pair_1 - #echo '+' - #end for - #echo ' ' - #for $i, $rd_pair_2 in enumerate( $input_selection.rd_pairs_input ): - #if $i == 0 - #echo "--rd_pair_2=" - #end if - #echo $rd_pair_2.rd_pair_2 - #echo '+' + #for $rpi in $input_selection.rd_pairs_input + --rd_pair_1=${rpi.rd_pair_1} + --rd_pair_2=${rpi.rd_pair_2} #end for #end if #if str($minOlap) @@ -61,7 +35,7 @@ - + @@ -104,7 +78,13 @@ **What it does** -Edena can accept both unpaired and paired files, FASTQ and FASTA format. Note that for technical reasons, all reads are required to be of the same length. 
You can however provide the program with different files containing different reads length. In such case, Edena will trim the 3’ ends of the longer reads so that they fit the shorter length. It is however required that reads within each individual file are of the same length (as Illumina GA reads are). By default all overlaps with a minimum size corresponding to half of the reads length are computed. This is quite conservative. Provided enough coverage, this value can be increased (option -M) to reduce the memory requirements. For reads longer than 100bp, you may consider the reads truncation option, which could help in discarding 3’ base calling errors. +Edena is an overlaps graph based short reads assembler and is suited to Illumina GA reads. An assembly with Edena is a two-step process: overlapping and assembling. + +In the overlapping step, the reads files are provided to the program, which computes the transitively reduced overlaps graph. This structure is then stored together with the sequence reads in the overlapping file. + +Edena can accept both unpaired and paired files, FASTQ and FASTA format. Note that for technical reasons, all reads are required to be of the same length. You can however provide the program with different files containing different read lengths. In such a case, Edena will trim the 3’ ends of the longer reads so that they fit the shorter length. It is however required that reads within each individual file are of the same length (as Illumina GA reads are). By default all overlaps with a minimum size corresponding to half of the reads length are computed. This is quite conservative. Provided enough coverage, this value can be increased (option -M) to reduce the memory requirements. + +For reads longer than 100bp, you may consider the reads truncation option, which could help in discarding 3’ base calling errors. **License and citation**