Mercurial > repos > lldelisle > fromgtftobed12
comparison fromgtfTobed12.py @ 0:418e4d0fe0bd draft
planemo upload for repository https://github.com/lldelisle/tools-lldelisle/tree/master/tools/fromgtfTobed12 commit 1aaffda5b95e0389e315179345642c0d005867c1
author | lldelisle |
---|---|
date | Fri, 04 Nov 2022 15:37:12 +0000 |
parents | |
children | 6fd4b3b90220 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:418e4d0fe0bd |
---|---|
1 import argparse | |
2 import sys | |
3 import warnings | |
4 | |
5 import gffutils | |
6 | |
7 warnings.filterwarnings("ignore", message="It appears you have a gene feature" | |
8 " in your GTF file. You may want to use the " | |
9 "`disable_infer_genes` option to speed up database " | |
10 "creation") | |
11 warnings.filterwarnings("ignore", message="It appears you have a transcript " | |
12 "feature in your GTF file. You may want to use the " | |
13 "`disable_infer_transcripts` option to speed up " | |
14 "database creation") | |
15 # In gffutils v0.10 they changed the error message: | |
16 warnings.filterwarnings("ignore", message="It appears you have a gene feature" | |
17 " in your GTF file. You may want to use the " | |
18 "`disable_infer_genes=True` option to speed up " | |
19 "database creation") | |
20 warnings.filterwarnings("ignore", message="It appears you have a transcript " | |
21 "feature in your GTF file. You may want to use the " | |
22 "`disable_infer_transcripts=True` option to speed up " | |
23 "database creation") | |
24 | |
25 | |
26 def convert_gtf_to_bed(fn, fo, useGene, mergeTranscripts, | |
27 mergeTranscriptsAndOverlappingExons, ucsc): | |
28 db = gffutils.create_db(fn, ':memory:') | |
29 # For each transcript: | |
30 prefered_name = "transcript_name" | |
31 if useGene or mergeTranscripts or mergeTranscriptsAndOverlappingExons: | |
32 prefered_name = "gene_name" | |
33 if mergeTranscripts or mergeTranscriptsAndOverlappingExons: | |
34 all_items = db.features_of_type("gene", order_by='start') | |
35 else: | |
36 all_items = db.features_of_type("transcript", order_by='start') | |
37 for tr in all_items: | |
38 # The name would be the name of the transcript/gene if exists | |
39 try: | |
40 # First try to have it directly on the feature | |
41 trName = tr.attributes[prefered_name][0] | |
42 except KeyError: | |
43 # Else try to guess the name of the transcript/gene from exons: | |
44 try: | |
45 trName = set([e.attributes[prefered_name][0] | |
46 for e in | |
47 db.children(tr, | |
48 featuretype='exon', | |
49 order_by='start')]).pop() | |
50 except KeyError: | |
51 # Else take the transcript id | |
52 trName = tr.id | |
53 # If the cds is defined in the gtf, | |
54 # use it to define the thick start and end | |
55 # The gtf is 1-based closed intervalls and | |
56 # bed are 0-based half-open so: | |
57 # I need to remove one from each start | |
58 try: | |
59 # In case of multiple CDS (when there is one entry per gene) | |
60 # I use the first one to get the start | |
61 # and the last one to get the end (order_by=-start) | |
62 cds_start = next(db.children(tr, | |
63 featuretype='CDS', | |
64 order_by='start')).start - 1 | |
65 cds_end = next(db.children(tr, | |
66 featuretype='CDS', | |
67 order_by='-start')).end | |
68 except StopIteration: | |
69 # If the CDS is not defined, then it is set to the start | |
70 # as proposed here: | |
71 # https://genome.ucsc.edu/FAQ/FAQformat.html#format1 | |
72 cds_start = tr.start - 1 | |
73 cds_end = tr.start - 1 | |
74 # Get all exons starts and lengths | |
75 if mergeTranscriptsAndOverlappingExons: | |
76 # We merge overlapping exons: | |
77 exons_starts = [] | |
78 exons_length = [] | |
79 current_start = -1 | |
80 current_end = None | |
81 for e in db.children(tr, featuretype='exon', order_by='start'): | |
82 if current_start == -1: | |
83 current_start = e.start - 1 | |
84 current_end = e.end | |
85 else: | |
86 if e.start > current_end: | |
87 # This is a non-overlapping exon | |
88 # We store the previous exon: | |
89 exons_starts.append(current_start) | |
90 exons_length.append(current_end - current_start) | |
91 # We set the current: | |
92 current_start = e.start - 1 | |
93 current_end = e.end | |
94 else: | |
95 # This is an overlapping exon | |
96 # We update current_end if necessary | |
97 current_end = max(current_end, e.end) | |
98 if current_start != -1: | |
99 # There is a last exon to store: | |
100 exons_starts.append(current_start) | |
101 exons_length.append(current_end - current_start) | |
102 else: | |
103 exons_starts = [e.start - 1 | |
104 for e in | |
105 db.children(tr, featuretype='exon', | |
106 order_by='start')] | |
107 exons_length = [len(e) | |
108 for e in | |
109 db.children(tr, featuretype='exon', | |
110 order_by='start')] | |
111 # Rewrite the chromosome name if needed: | |
112 chrom = tr.chrom | |
113 if ucsc and chrom[0:3] != 'chr': | |
114 chrom = 'chr' + chrom | |
115 fo.write("%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\n" % | |
116 (chrom, tr.start - 1, tr.end, trName, 0, tr.strand, | |
117 cds_start, cds_end, "0", len(exons_starts), | |
118 ",".join([str(ex_l) for ex_l in exons_length]), | |
119 ",".join([str(s - (tr.start - 1)) for s in exons_starts]))) | |
120 | |
121 | |
122 argp = argparse.ArgumentParser( | |
123 description=("Convert a gtf to a bed12 with one entry" | |
124 " per transcript/gene")) | |
125 argp.add_argument('input', default=None, | |
126 help="Input gtf file (can be gzip).") | |
127 argp.add_argument('--output', default=sys.stdout, | |
128 type=argparse.FileType('w'), | |
129 help="Output bed12 file.") | |
130 argp.add_argument('--useGene', action="store_true", | |
131 help="Use the gene name instead of the " | |
132 "transcript name.") | |
133 argp.add_argument('--ucscformat', action="store_true", | |
134 help="If you want that all chromosome names " | |
135 "begin with 'chr'.") | |
136 group = argp.add_mutually_exclusive_group() | |
137 group.add_argument('--mergeTranscripts', action="store_true", | |
138 help="Merge all transcripts into a single " | |
139 "entry to have one line per gene.") | |
140 group.add_argument('--mergeTranscriptsAndOverlappingExons', | |
141 action="store_true", | |
142 help="Merge all transcripts into a single " | |
143 "entry to have one line per gene and merge" | |
144 " overlapping exons.") | |
145 | |
146 args = argp.parse_args() | |
147 convert_gtf_to_bed(args.input, args.output, args.useGene, | |
148 args.mergeTranscripts, | |
149 args.mergeTranscriptsAndOverlappingExons, | |
150 args.ucscformat) |