Mercurial > repos > devteam > intersect
changeset 5:33b3f3688db4 draft
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/intersect commit cae3e05d02e60f595bb8b6d77a84f030e9bd1689
author | devteam |
---|---|
date | Thu, 22 Jun 2017 18:52:23 -0400 |
parents | 8ddabc73af92 |
children | 69c10b56f46d |
files | gops_intersect.py intersect.xml macros.xml operation_filter.py tool_dependencies.xml utils/__init__.pyc utils/gff_util.py utils/gff_util.pyc utils/odict.pyc |
diffstat | 9 files changed, 181 insertions(+), 177 deletions(-) [+] |
line wrap: on
line diff
--- a/gops_intersect.py Thu Feb 11 12:11:25 2016 -0500 +++ b/gops_intersect.py Thu Jun 22 18:52:23 2017 -0400 @@ -11,14 +11,18 @@ -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval """ +from __future__ import print_function + import fileinput import sys + +from bx.cookbook import doc_optparse from bx.intervals.io import GenomicInterval, NiceReaderWrapper from bx.intervals.operations.intersect import intersect -from bx.cookbook import doc_optparse from bx.tabular.io import ParseError from galaxy.tools.util.galaxyops import fail, parse_cols_arg, skipped -from utils.gff_util import GFFFeature, GFFReaderWrapper, convert_bed_coords_to_gff + +from utils.gff_util import convert_bed_coords_to_gff, GFFFeature, GFFReaderWrapper assert sys.version_info[:2] >= ( 2, 4 ) @@ -80,16 +84,17 @@ out_file.write( "%s\n" % "\t".join( feature.fields ) ) else: out_file.write( "%s\n" % feature ) - except ParseError, e: + except ParseError as e: out_file.close() fail( "Invalid file format: %s" % str( e ) ) out_file.close() if g1.skipped > 0: - print skipped( g1, filedesc=" of 1st dataset" ) + print(skipped( g1, filedesc=" of 1st dataset" )) if g2.skipped > 0: - print skipped( g2, filedesc=" of 2nd dataset" ) + print(skipped( g2, filedesc=" of 2nd dataset" )) + if __name__ == "__main__": main()
--- a/intersect.xml Thu Feb 11 12:11:25 2016 -0500 +++ b/intersect.xml Thu Jun 22 18:52:23 2017 -0400 @@ -1,147 +1,132 @@ -<tool id="gops_intersect_1" name="Intersect" version="1.0.0"> - <description>the intervals of two datasets</description> - <requirements> - <requirement type="package" version="0.7.1">bx-python</requirement> - <requirement type="package" version="1.0.0">galaxy-ops</requirement> - </requirements> - <command interpreter="python">gops_intersect.py - $input1 $input2 $output - - #if isinstance( $input1.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -1 1,4,5,7 --gff1 - #else: - -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} - #end if - - #if isinstance( $input2.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -2 1,4,5,7 --gff2 - #else: - -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} - #end if - - -m $min $returntype - </command> - <inputs> - <param name="returntype" type="select" label="Return" help="(see figure below)"> - <option value="">Overlapping Intervals</option> - <option value="-p">Overlapping pieces of Intervals</option> - </param> - <param format="interval,gff" name="input1" type="data" help="First dataset"> - <label>of</label> - </param> - <param format="interval,gff" name="input2" type="data" help="Second dataset"> - <label>that intersect</label> - </param> - <param name="min" type="integer" value="1" min="1" help="(bp)"> - <label>for at least</label> - </param> - </inputs> - <outputs> - <data format="input" name="output" metadata_source="input1"/> - </outputs> - <code file="operation_filter.py"/> - <trackster_conf/> - <tests> - <test> - <param name="input1" value="1.bed" /> - <param name="input2" value="2.bed" /> - <param name="min" value="1" /> - <param name="returntype" value="" /> - <output name="output" file="gops_intersect_out.bed" /> - </test> - <test> - <param name="input1" value="1.bed" /> - <param name="input2" value="2_mod.bed" ftype="interval"/> - <param name="min" value="1" /> - <param name="returntype" value="" /> - <output name="output" file="gops_intersect_diffCols.bed" /> - </test> - <test> - <param name="input1" value="1.bed" /> - <param name="input2" value="2_mod.bed" ftype="interval"/> - <param name="min" value="1" /> - <param name="returntype" value="Overlapping pieces of Intervals" /> - <output name="output" file="gops_intersect_p_diffCols.bed" /> - </test> - <test> - <param name="input1" value="1.bed" /> - <param name="input2" value="2.bed" /> - <param name="min" value="10" /> - <param name="returntype" value="Overlapping pieces of Intervals" /> - <output name="output" file="gops_intersect_p_out.bed" /> - </test> - <test> - <param name="input1" value="gops_bigint.interval" ftype="interval" /> - <param name="input2" value="gops_bigint2.interval" ftype="interval" /> - <param name="min" value="1" /> - <param name="returntype" value="" /> - <output name="output" file="gops_intersect_bigint_out.interval" /> - </test> - <test> - <param name="input1" value="gops_bigint2.interval" ftype="interval" /> - <param name="input2" value="gops_bigint.interval" ftype="interval" /> - <param name="min" value="1" /> - <param name="returntype" value="" /> - <output name="output" file="gops_intersect_bigint_out.interval" /> - </test> - <test> - <param name="input1" value="12.bed" ftype="bed" /> - <param name="input2" value="1.bed" ftype="bed" /> - <param name="min" value="1" /> - <param name="returntype" value="" /> - <output name="output" file="gops_intersect_no_strand_out.bed" /> - </test> - <!-- Intersect two GFF files. --> - <test> - <param name="input1" value="gops_subtract_in1.gff" /> - <param name="input2" value="gops_subtract_in2.gff" /> - <param name="min" value="1" /> - <param name="returntype" value="" /> - <output name="output" file="gops_intersect_out2.gff" /> - </test> - <!-- Intersect GFF file and bed file. --> - <test> - <param name="input1" value="gops_subtract_in1.gff" /> - <param name="input2" value="gops_subtract_in2.bed" /> - <param name="min" value="1" /> - <param name="returntype" value="" /> - <output name="output" file="gops_intersect_out2.gff" /> - </test> - - </tests> - <help> - -.. class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations - ------ - -**Syntax** - -- **Where overlap is at least** sets the minimum length (in base pairs) of overlap between elements of the two datasets -- **Overlapping Intervals** returns entire intervals from the first dataset that overlap the second dataset. The returned intervals are completely unchanged, and this option only filters out intervals that do not overlap with the second dataset. -- **Overlapping pieces of Intervals** returns intervals that indicate the exact base pair overlap between the first dataset and the second dataset. The intervals returned are from the first dataset, and all fields besides start and end are guaranteed to remain unchanged. - ------ - -**Examples** - -Overlapping Intervals: - -.. image:: gops_intersectOverlappingIntervals.gif - -Overlapping Pieces of Intervals: - -.. image:: gops_intersectOverlappingPieces.gif - -</help> -</tool> +<tool id="gops_intersect_1" name="Intersect" version="1.0.0"> + <description>the intervals of two datasets</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <code file="operation_filter.py"/> + <command><![CDATA[ +python '$__tool_directory__/gops_intersect.py' +'$input1' +'$input2' +'$output' + +#if $input1.is_of_type('gff') + -1 1,4,5,7 --gff1 +#else: + -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} +#end if + +#if $input2.is_of_type('gff') + -2 1,4,5,7 --gff2 +#else: + -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} +#end if + +-m $min $returntype + ]]></command> + <inputs> + <param name="returntype" type="select" label="Return" help="(see figure below)"> + <option value="">Overlapping Intervals</option> + <option value="-p">Overlapping pieces of Intervals</option> + </param> + <param name="input1" type="data" format="interval,gff" label="of" help="First dataset" /> + <param name="input2" type="data" format="interval,gff" label="that intersect" help="Second dataset" /> + <param name="min" type="integer" value="1" min="1" label="for at least" help="(bp)" /> + </inputs> + <outputs> + <data name="output" format_source="input1" metadata_source="input1"/> + </outputs> + <tests> + <test> + <param name="input1" value="1.bed" /> + <param name="input2" value="2.bed" /> + <param name="min" value="1" /> + <param name="returntype" value="" /> + <output name="output" file="gops_intersect_out.bed" /> + </test> + <test> + <param name="input1" value="1.bed" /> + <param name="input2" value="2_mod.bed" ftype="interval"/> + <param name="min" value="1" /> + <param name="returntype" value="" /> + <output name="output" file="gops_intersect_diffCols.bed" /> + </test> + <test> + <param name="input1" value="1.bed" /> + <param name="input2" value="2_mod.bed" ftype="interval"/> + <param name="min" value="1" /> + <param name="returntype" value="Overlapping pieces of Intervals" /> + <output name="output" file="gops_intersect_p_diffCols.bed" /> + </test> + <test> + <param name="input1" value="1.bed" /> + <param name="input2" value="2.bed" /> + <param name="min" value="10" /> + <param name="returntype" value="Overlapping pieces of Intervals" /> + <output name="output" file="gops_intersect_p_out.bed" /> + </test> + <test> + <param name="input1" value="gops_bigint.interval" ftype="interval" /> + <param name="input2" value="gops_bigint2.interval" ftype="interval" /> + <param name="min" value="1" /> + <param name="returntype" value="" /> + <output name="output" file="gops_intersect_bigint_out.interval" /> + </test> + <test> + <param name="input1" value="gops_bigint2.interval" ftype="interval" /> + <param name="input2" value="gops_bigint.interval" ftype="interval" /> + <param name="min" value="1" /> + <param name="returntype" value="" /> + <output name="output" file="gops_intersect_bigint_out.interval" /> + </test> + <test> + <param name="input1" value="12.bed" ftype="bed" /> + <param name="input2" value="1.bed" ftype="bed" /> + <param name="min" value="1" /> + <param name="returntype" value="" /> + <output name="output" file="gops_intersect_no_strand_out.bed" /> + </test> + <!-- Intersect two GFF files. --> + <test> + <param name="input1" value="gops_subtract_in1.gff" /> + <param name="input2" value="gops_subtract_in2.gff" /> + <param name="min" value="1" /> + <param name="returntype" value="" /> + <output name="output" file="gops_intersect_out2.gff" /> + </test> + <!-- Intersect GFF file and bed file. --> + <test> + <param name="input1" value="gops_subtract_in1.gff" /> + <param name="input2" value="gops_subtract_in2.bed" /> + <param name="min" value="1" /> + <param name="returntype" value="" /> + <output name="output" file="gops_intersect_out2.gff" /> + </test> + </tests> + <help><![CDATA[ +.. class:: infomark + +**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. + +@SCREENCASTS@ + +**Syntax** + +- **Where overlap is at least** sets the minimum length (in base pairs) of overlap between elements of the two datasets +- **Overlapping Intervals** returns entire intervals from the first dataset that overlap the second dataset. The returned intervals are completely unchanged, and this option only filters out intervals that do not overlap with the second dataset. +- **Overlapping pieces of Intervals** returns intervals that indicate the exact base pair overlap between the first dataset and the second dataset. The intervals returned are from the first dataset, and all fields besides start and end are guaranteed to remain unchanged. + +----- + +**Examples** + +Overlapping Intervals: + +.. image:: gops_intersectOverlappingIntervals.gif + +Overlapping Pieces of Intervals: + +.. image:: gops_intersectOverlappingPieces.gif + ]]></help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Thu Jun 22 18:52:23 2017 -0400 @@ -0,0 +1,20 @@ +<?xml version="1.0"?> +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package" version="0.7.1">bx-python</requirement> + <requirement type="package" version="1.0.0">galaxy-ops</requirement> + </requirements> + </xml> + <token name="@SCREENCASTS@"> +----- + +**Screencasts!** + +See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). + +.. _Screencasts: https://galaxyproject.org/learn/interval-operations/ + +----- + </token> +</macros>
--- a/operation_filter.py Thu Feb 11 12:11:25 2016 -0500 +++ b/operation_filter.py Thu Jun 22 18:52:23 2017 -0400 @@ -1,8 +1,7 @@ # runs after the job (and after the default post-filter) +from galaxy.jobs.handler import JOB_ERROR from galaxy.tools.parameters import DataToolParameter -from galaxy.jobs.handler import JOB_ERROR - # Older py compatibility try: set() @@ -14,7 +13,7 @@ dbkeys = set() data_param_names = set() data_params = 0 - for name, param in page_param_map.iteritems(): + for name, param in page_param_map.items(): if isinstance( param, DataToolParameter ): # for each dataset parameter if param_values.get(name, None) is not None: @@ -53,7 +52,6 @@ try: if stderr and len( stderr ) > 0: raise Exception( stderr ) - except Exception: data.blurb = JOB_ERROR data.state = JOB_ERROR
--- a/tool_dependencies.xml Thu Feb 11 12:11:25 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,9 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="bx-python" version="0.7.1"> - <repository changeset_revision="2d0c08728bca" name="package_bx_python_0_7" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> - </package> - <package name="galaxy-ops" version="1.0.0"> - <repository changeset_revision="eef263ff9b95" name="package_galaxy_ops_1_0_0" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> - </package> -</tool_dependency>
--- a/utils/gff_util.py Thu Feb 11 12:11:25 2016 -0500 +++ b/utils/gff_util.py Thu Jun 22 18:52:23 2017 -0400 @@ -1,11 +1,12 @@ """ Provides utilities for working with GFF files. """ +import copy -import copy from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper -from bx.tabular.io import Header, Comment, ParseError -from utils.odict import odict +from bx.tabular.io import Comment, Header, ParseError + +from .odict import odict class GFFInterval( GenomicInterval ): @@ -144,7 +145,7 @@ self.default_strand, fix_strand=self.fix_strand ) return interval - def next( self ): + def __next__( self ): """ Returns next GFFFeature. """ # @@ -177,10 +178,10 @@ while not self.seed_interval: try: self.seed_interval = GenomicIntervalReader.next( self ) - except ParseError, e: + except ParseError as e: handle_parse_error( e ) # TODO: When no longer supporting python 2.4 use finally: - #finally: + # finally: raw_size += len( self.current_line ) # If header or comment, clear seed interval and return it with its size. @@ -205,19 +206,19 @@ try: interval = GenomicIntervalReader.next( self ) raw_size += len( self.current_line ) - except StopIteration, e: + except StopIteration as e: # No more intervals to read, but last feature needs to be # returned. interval = None raw_size += len( self.current_line ) break - except ParseError, e: + except ParseError as e: handle_parse_error( e ) raw_size += len( self.current_line ) continue # TODO: When no longer supporting python 2.4 use finally: - #finally: - #raw_size += len( self.current_line ) + # finally: + # raw_size += len( self.current_line ) # Ignore comments. if isinstance( interval, Comment ): @@ -263,6 +264,7 @@ convert_gff_coords_to_bed( feature ) return feature + next = __next__ # This line should be removed once the bx-python port to Python3 is finished def convert_bed_coords_to_gff( interval ): @@ -374,7 +376,9 @@ # -- Get function that generates line/feature key. -- - get_transcript_id = lambda fields: parse_gff_attributes( fields[8] )[ 'transcript_id' ] + def get_transcript_id(fields): + return parse_gff_attributes( fields[8] )[ 'transcript_id' ] + if strict: # Strict GTF parsing uses transcript_id only to group lines into feature. key_fn = get_transcript_id @@ -382,7 +386,8 @@ # Use lenient parsing where chromosome + transcript_id is the key. This allows # transcripts with same ID on different chromosomes; this occurs in some popular # datasources, such as RefGenes in UCSC. - key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields ) + def key_fn(fields): + return fields[0] + '_' + get_transcript_id( fields ) # Aggregate intervals by transcript_id and collect comments. feature_intervals = odict()