retrieve_ensembl_bed: retrieve_ensembl

comparison retrieve_ensembl_bed.py @ 0:da1b538b87e5 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/retrieve_ensembl_bed commit 88cf1e923a8c9e5bc6953ad412d15a7c70f054d1

author	galaxyp
date	Mon, 22 Jan 2018 13:13:47 -0500
parents
children	9c4a48f5d4e7

comparison

equal deleted inserted replaced

--1:000000000000
+:da1b538b87e5
+#!/usr/bin/env python
+"""
+#
+#------------------------------------------------------------------------------
+#                         University of Minnesota
+#         Copyright 2017, Regents of the University of Minnesota
+#------------------------------------------------------------------------------
+# Author:
+#
+#  James E Johnson
+#
+#------------------------------------------------------------------------------
+"""
+from __future__ import print_function
+import argparse
+import re
+import sys
+from bedutil import bed_from_line
+from ensembl_rest import get_toplevel, get_transcripts_bed, max_region
+def __main__():
+parser = argparse.ArgumentParser(
+description='Retrieve Ensembl cDNAs in BED format')
+parser.add_argument(
+'-s', '--species', default='human',
+help='Ensembl Species to retrieve')
+parser.add_argument(
+'-R', '--regions', action='append', default=[],
+help='Restrict Ensembl retrieval to regions e.g.:'
++ ' X,2:20000-25000,3:100-500+')
+parser.add_argument(
+'-B', '--biotypes', action='append', default=[],
+help='Restrict Ensembl biotypes to retrieve')
+parser.add_argument(
+'-X', '--extended_bed', action='store_true', default=False,
+help='Include the extended columns returned from Ensembl')
+parser.add_argument(
+'-U', '--ucsc_chrom_names', action='store_true', default=False,
+help='Use the UCSC names for Chromosomes')
+parser.add_argument(
+'-t', '--toplevel', action='store_true',
+help='Print Ensembl toplevel for species')
+parser.add_argument(
+'output',
+help='Output BED filepath, or for stdout: "-"')
+parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
+parser.add_argument('-d', '--debug', action='store_true', help='Debug')
+args = parser.parse_args()
+species = args.species
+out_wtr = open(args.output, 'w') if args.output != '-' else sys.stdout
+biotypes = ';'.join(['biotype=%s' % bt.strip()
+for biotype in args.biotypes
+for bt in biotype.split(',') if bt.strip()])
+selected_regions = dict()  # chrom:(start, end)
+region_pat = '^([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?'
+if args.regions:
+for entry in args.regions:
+if not entry:
+continue
+regs = [x.strip() for x in entry.split(',') if x.strip()]
+for reg in regs:
+m = re.match(region_pat, reg)
+if m:
+(chrom, start, end, strand) = m.groups()
+if chrom:
+if chrom not in selected_regions:
+selected_regions[chrom] = []
+selected_regions[chrom].append([start, end, strand])
+if args.debug:
+print("selected_regions: %s" % selected_regions, file=sys.stderr)
+def retrieve_region(species, ref, start, stop, strand):
+transcript_count = 0
+regions = list(range(start, stop, max_region))
+if not regions or regions[-1] < stop:
+regions.append(stop)
+for end in regions[1:]:
+bedlines = get_transcripts_bed(species, ref, start, end,
+strand=strand, params=biotypes)
+if args.debug:
+print("%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d" %
+(species, ref, start, end, len(bedlines)),
+file=sys.stderr)
+# start, end, seq
+for i, bedline in enumerate(bedlines):
+if args.debug:
+print("%s\n" % (bedline), file=sys.stderr)
+if not args.ucsc_chrom_names:
+bedline = re.sub('^[^\t]+', ref, bedline)
+try:
+if out_wtr:
+out_wtr.write(bedline.replace(',\t', '\t')
+if args.extended_bed
+else str(bed_from_line(bedline)))
+out_wtr.write("\n")
+out_wtr.flush()
+except Exception as e:
+print("BED error (%s) : %s\n" % (e, bedline),
+file=sys.stderr)
+start = end + 1
+return transcript_count
+coord_systems = get_toplevel(species)
+if 'chromosome' in coord_systems:
+ref_lengths = dict()
+for ref in sorted(coord_systems['chromosome'].keys()):
+length = coord_systems['chromosome'][ref]
+ref_lengths[ref] = length
+if args.toplevel:
+print("%s\t%s\tlength: %d" % (species, ref, length),
+file=sys.stderr)
+if selected_regions:
+transcript_count = 0
+for ref in sorted(selected_regions.keys()):
+if ref in ref_lengths:
+for reg in selected_regions[ref]:
+(_start, _stop, _strand) = reg
+start = int(_start) if _start else 0
+stop = int(_stop) if _stop else ref_lengths[ref]
+strand = '' if not _strand else ':1'\
+if _strand == '+' else ':-1'
+transcript_count += retrieve_region(species, ref,
+start, stop,
+strand)
+if args.debug or args.verbose:
+length = stop - start
+print("%s\t%s:%d-%d%s\tlength: %d\ttrancripts:%d" %
+(species, ref, start, stop, strand,
+length, transcript_count),
+file=sys.stderr)
+else:
+strand = ''
+start = 0
+for ref in sorted(ref_lengths.keys()):
+length = ref_lengths[ref]
+transcript_count = 0
+if args.debug:
+print("Retrieving transcripts: %s\t%s\tlength: %d" %
+(species, ref, length), file=sys.stderr)
+transcript_count += retrieve_region(species, ref, start,
+length, strand)
+if args.debug or args.verbose:
+print("%s\t%s\tlength: %d\ttrancripts:%d" %
+(species, ref, length, transcript_count),
+file=sys.stderr)
+if __name__ == "__main__":
+__main__()

Mercurial > repos > galaxyp > retrieve_ensembl_bed

comparison retrieve_ensembl_bed.py @ 0:da1b538b87e5 draft