Mercurial > repos > galaxyp > retrieve_ensembl_bed
comparison retrieve_ensembl_bed.py @ 0:da1b538b87e5 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/retrieve_ensembl_bed commit 88cf1e923a8c9e5bc6953ad412d15a7c70f054d1
| author | galaxyp | 
|---|---|
| date | Mon, 22 Jan 2018 13:13:47 -0500 | 
| parents | |
| children | 9c4a48f5d4e7 | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:da1b538b87e5 | 
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 # | |
| 4 #------------------------------------------------------------------------------ | |
| 5 # University of Minnesota | |
| 6 # Copyright 2017, Regents of the University of Minnesota | |
| 7 #------------------------------------------------------------------------------ | |
| 8 # Author: | |
| 9 # | |
| 10 # James E Johnson | |
| 11 # | |
| 12 #------------------------------------------------------------------------------ | |
| 13 """ | |
| 14 | |
| 15 from __future__ import print_function | |
| 16 | |
| 17 import argparse | |
| 18 import re | |
| 19 import sys | |
| 20 | |
| 21 from bedutil import bed_from_line | |
| 22 | |
| 23 from ensembl_rest import get_toplevel, get_transcripts_bed, max_region | |
| 24 | |
| 25 | |
def _build_parser():
    """Return the argparse parser describing this tool's command line."""
    parser = argparse.ArgumentParser(
        description='Retrieve Ensembl cDNAs in BED format')
    parser.add_argument(
        '-s', '--species', default='human',
        help='Ensembl Species to retrieve')
    parser.add_argument(
        '-R', '--regions', action='append', default=[],
        help='Restrict Ensembl retrieval to regions e.g.:'
             + ' X,2:20000-25000,3:100-500+')
    parser.add_argument(
        '-B', '--biotypes', action='append', default=[],
        help='Restrict Ensembl biotypes to retrieve')
    parser.add_argument(
        '-X', '--extended_bed', action='store_true', default=False,
        help='Include the extended columns returned from Ensembl')
    parser.add_argument(
        '-U', '--ucsc_chrom_names', action='store_true', default=False,
        help='Use the UCSC names for Chromosomes')
    parser.add_argument(
        '-t', '--toplevel', action='store_true',
        help='Print Ensembl toplevel for species')
    parser.add_argument(
        'output',
        help='Output BED filepath, or for stdout: "-"')
    parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
    parser.add_argument('-d', '--debug', action='store_true', help='Debug')
    return parser


def _parse_regions(region_args):
    """Parse the ``--regions`` values into a per-reference lookup.

    Each value may hold several comma-separated regions, e.g.
    ``X,2:20000-25000,3:100-500+``.

    Returns a dict mapping the reference name to a list of
    ``[start, end, strand]`` entries.  The entries hold the matched text
    unchanged: ``start`` may be ``''`` or ``None``, ``end`` and ``strand``
    may be ``None`` when that part was not given.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape.
    region_pat = re.compile(r'^([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?')
    selected = dict()
    for entry in region_args:
        if not entry:
            continue
        for reg in (x.strip() for x in entry.split(',')):
            if not reg:
                continue
            m = region_pat.match(reg)
            if not m:
                continue
            (chrom, start, end, strand) = m.groups()
            selected.setdefault(chrom, []).append([start, end, strand])
    return selected


def __main__():
    """Command-line entry point: write Ensembl transcripts as BED lines.

    Parses the command line, looks up the 'chromosome' entries returned by
    ``get_toplevel`` for the species, then retrieves transcripts either for
    the regions given with ``--regions`` or for every chromosome, writing one
    BED line per transcript to the output path (``-`` means stdout).
    Progress and diagnostics go to stderr.
    """
    args = _build_parser().parse_args()
    species = args.species
    # Query parameters of the form 'biotype=a;biotype=b', passed through to
    # get_transcripts_bed as ``params``.
    biotypes = ';'.join(['biotype=%s' % bt.strip()
                         for biotype in args.biotypes
                         for bt in biotype.split(',') if bt.strip()])

    selected_regions = _parse_regions(args.regions)  # chrom:[start,end,strand]
    if args.debug:
        print("selected_regions: %s" % selected_regions, file=sys.stderr)

    # Matches the first (chromosome name) column of a BED line.
    chrom_pat = re.compile('^[^\t]+')
    out_wtr = open(args.output, 'w') if args.output != '-' else sys.stdout

    def retrieve_region(species, ref, start, stop, strand):
        """Retrieve ``ref`` from start to stop and write its BED lines.

        The span is requested in pieces of at most ``max_region`` positions.
        Returns the number of BED lines actually written; lines that fail
        conversion are reported on stderr and not counted.
        """
        transcript_count = 0

        def use_ref_name(_match):
            # A callable replacement keeps ``ref`` literal; a plain string
            # would have backslashes / group references interpreted by re.
            return ref

        regions = list(range(start, stop, max_region))
        if not regions or regions[-1] < stop:
            regions.append(stop)
        for end in regions[1:]:
            bedlines = get_transcripts_bed(species, ref, start, end,
                                           strand=strand, params=biotypes)
            if args.debug:
                print("%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d" %
                      (species, ref, start, end, len(bedlines)),
                      file=sys.stderr)
            for bedline in bedlines:
                if args.debug:
                    print("%s\n" % (bedline), file=sys.stderr)
                if not args.ucsc_chrom_names:
                    # Replace the returned chromosome name with ``ref``.
                    bedline = chrom_pat.sub(use_ref_name, bedline)
                try:
                    record = (bedline.replace(',\t', '\t')
                              if args.extended_bed
                              else str(bed_from_line(bedline)))
                    out_wtr.write(record + "\n")
                    # Previously never incremented, so counts were always 0.
                    transcript_count += 1
                except Exception as e:
                    # Best effort: report the bad line and carry on.
                    print("BED error (%s) : %s\n" % (e, bedline),
                          file=sys.stderr)
            out_wtr.flush()
            # Next piece starts right after the one just retrieved.
            start = end + 1
        return transcript_count

    try:
        coord_systems = get_toplevel(species)
        if 'chromosome' in coord_systems:
            ref_lengths = dict()
            for ref in sorted(coord_systems['chromosome'].keys()):
                length = coord_systems['chromosome'][ref]
                ref_lengths[ref] = length
                if args.toplevel:
                    print("%s\t%s\tlength: %d" % (species, ref, length),
                          file=sys.stderr)
            if selected_regions:
                # Running total over all requested regions.
                transcript_count = 0
                for ref in sorted(selected_regions.keys()):
                    if ref not in ref_lengths:
                        # Only reported on request, so default stderr
                        # output stays unchanged.
                        if args.debug or args.verbose:
                            print("%s\t%s\tnot a toplevel chromosome,"
                                  " skipped" % (species, ref),
                                  file=sys.stderr)
                        continue
                    for reg in selected_regions[ref]:
                        (_start, _stop, _strand) = reg
                        start = int(_start) if _start else 0
                        stop = int(_stop) if _stop else ref_lengths[ref]
                        if not _strand:
                            strand = ''
                        elif _strand == '+':
                            strand = ':1'
                        else:
                            strand = ':-1'
                        transcript_count += retrieve_region(species, ref,
                                                            start, stop,
                                                            strand)
                        if args.debug or args.verbose:
                            length = stop - start
                            print("%s\t%s:%d-%d%s\tlength: %d"
                                  "\ttrancripts:%d" %
                                  (species, ref, start, stop, strand,
                                   length, transcript_count),
                                  file=sys.stderr)
            else:
                strand = ''
                start = 0
                for ref in sorted(ref_lengths.keys()):
                    length = ref_lengths[ref]
                    if args.debug:
                        print("Retrieving transcripts: %s\t%s\tlength: %d" %
                              (species, ref, length), file=sys.stderr)
                    transcript_count = retrieve_region(species, ref, start,
                                                       length, strand)
                    if args.debug or args.verbose:
                        print("%s\t%s\tlength: %d\ttrancripts:%d" %
                              (species, ref, length, transcript_count),
                              file=sys.stderr)
    finally:
        # Close the output file we opened; never close the shared stdout.
        if out_wtr is not sys.stdout:
            out_wtr.close()
| 152 | |
| 153 | |
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    __main__()
