# HG changeset patch # User galaxyp # Date 1570479279 14400 # Node ID 9c4a48f5d4e737ea32830755492b50297b24ae8a # Parent da1b538b87e52f3d48fddb5fe2d1d20d1b284c97 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/retrieve_ensembl_bed commit 6babd357845126292cb202aaea0f70ff68819525" diff -r da1b538b87e5 -r 9c4a48f5d4e7 retrieve_ensembl_bed.py --- a/retrieve_ensembl_bed.py Mon Jan 22 13:13:47 2018 -0500 +++ b/retrieve_ensembl_bed.py Mon Oct 07 16:14:39 2019 -0400 @@ -34,6 +34,12 @@ help='Restrict Ensembl retrieval to regions e.g.:' + ' X,2:20000-25000,3:100-500+') parser.add_argument( + '-i', '--interval_file', default=None, + help='Regions from a bed, gff, or interval file') + parser.add_argument( + '-f', '--interval_format', choices=['bed','gff','interval'], default='interval', + help='Interval format has TAB-separated columns: Seq, Start, End, Strand') + parser.add_argument( '-B', '--biotypes', action='append', default=[], help='Restrict Ensembl biotypes to retrieve') parser.add_argument( @@ -75,6 +81,27 @@ if args.debug: print("selected_regions: %s" % selected_regions, file=sys.stderr) + if args.interval_file: + pat = r'^(?:chr)?([^\t]+)(?:\t(\d+)(?:\t(\d+)(?:\t([+-])?)?)?)?.*' + if args.interval_format == 'bed': + pat = r'^(?:chr)?([^\t]+)\t(\d+)\t(\d+)(?:(?:\t[^\t]+\t[^\t]+\t)([+-]))?.*' + elif args.interval_format == 'gff': + pat = r'^(?:chr)?([^\t]+)\t(\d+)\t(\d+)(?:(?:\t[^\t]+\t[^\t]+\t)([+-]))?.*' + with open(args.interval_file,'r') as fh: + for i, line in enumerate(fh): + if line.startswith('#'): + continue + m = re.match(pat, line.rstrip()) + if m: + (chrom, start, end, strand) = m.groups() + if chrom: + if chrom not in selected_regions: + selected_regions[chrom] = [] + selected_regions[chrom].append([start, end, strand]) + if args.debug: + print("selected_regions: %s" % selected_regions, file=sys.stderr) + + def retrieve_region(species, ref, start, stop, strand): transcript_count = 0 regions = list(range(start, stop, max_region)) diff -r da1b538b87e5 -r 9c4a48f5d4e7 retrieve_ensembl_bed.xml --- a/retrieve_ensembl_bed.xml Mon Jan 22 13:13:47 2018 -0500 +++ b/retrieve_ensembl_bed.xml Mon Oct 07 16:14:39 2019 -0400 @@ -20,6 +20,16 @@ #if $regions: --regions '$regions' #end if + #if $interval_file: + #if $interval_file.ext.find('bed') > -1 + --interval_format bed + #elif $interval_file.ext in ['gff','gtf','gff3'] + --interval_format gff + #else + --interval_format interval + #end if + --interval_file '$interval_file' + #end if '$transcript_bed' ]]> @@ -40,6 +50,7 @@ Each region is specifed as: chr or chr:pos or chr:from-to ^(\w+(:\d+(-\d+)?)?(,\w+(:\d+(-\d+)?)?)*)?$ + @@ -60,6 +71,16 @@ + + + + + + + + + +