Mercurial > repos > lldelisle > fromgtftobed12
diff fromgtfTobed12.py @ 0:418e4d0fe0bd draft
planemo upload for repository https://github.com/lldelisle/tools-lldelisle/tree/master/tools/fromgtfTobed12 commit 1aaffda5b95e0389e315179345642c0d005867c1
author | lldelisle |
---|---|
date | Fri, 04 Nov 2022 15:37:12 +0000 |
parents | |
children | 6fd4b3b90220 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fromgtfTobed12.py Fri Nov 04 15:37:12 2022 +0000 @@ -0,0 +1,150 @@ +import argparse +import sys +import warnings + +import gffutils + +warnings.filterwarnings("ignore", message="It appears you have a gene feature" + " in your GTF file. You may want to use the " + "`disable_infer_genes` option to speed up database " + "creation") +warnings.filterwarnings("ignore", message="It appears you have a transcript " + "feature in your GTF file. You may want to use the " + "`disable_infer_transcripts` option to speed up " + "database creation") +# In gffutils v0.10 they changed the error message: +warnings.filterwarnings("ignore", message="It appears you have a gene feature" + " in your GTF file. You may want to use the " + "`disable_infer_genes=True` option to speed up " + "database creation") +warnings.filterwarnings("ignore", message="It appears you have a transcript " + "feature in your GTF file. You may want to use the " + "`disable_infer_transcripts=True` option to speed up " + "database creation") + + +def convert_gtf_to_bed(fn, fo, useGene, mergeTranscripts, + mergeTranscriptsAndOverlappingExons, ucsc): + db = gffutils.create_db(fn, ':memory:') + # For each transcript: + prefered_name = "transcript_name" + if useGene or mergeTranscripts or mergeTranscriptsAndOverlappingExons: + prefered_name = "gene_name" + if mergeTranscripts or mergeTranscriptsAndOverlappingExons: + all_items = db.features_of_type("gene", order_by='start') + else: + all_items = db.features_of_type("transcript", order_by='start') + for tr in all_items: + # The name would be the name of the transcript/gene if exists + try: + # First try to have it directly on the feature + trName = tr.attributes[prefered_name][0] + except KeyError: + # Else try to guess the name of the transcript/gene from exons: + try: + trName = set([e.attributes[prefered_name][0] + for e in + db.children(tr, + featuretype='exon', + order_by='start')]).pop() + except KeyError: + # Else take the transcript id + trName = tr.id + # If the cds is defined in the gtf, + # use it to define the thick start and end + # The gtf is 1-based closed intervalls and + # bed are 0-based half-open so: + # I need to remove one from each start + try: + # In case of multiple CDS (when there is one entry per gene) + # I use the first one to get the start + # and the last one to get the end (order_by=-start) + cds_start = next(db.children(tr, + featuretype='CDS', + order_by='start')).start - 1 + cds_end = next(db.children(tr, + featuretype='CDS', + order_by='-start')).end + except StopIteration: + # If the CDS is not defined, then it is set to the start + # as proposed here: + # https://genome.ucsc.edu/FAQ/FAQformat.html#format1 + cds_start = tr.start - 1 + cds_end = tr.start - 1 + # Get all exons starts and lengths + if mergeTranscriptsAndOverlappingExons: + # We merge overlapping exons: + exons_starts = [] + exons_length = [] + current_start = -1 + current_end = None + for e in db.children(tr, featuretype='exon', order_by='start'): + if current_start == -1: + current_start = e.start - 1 + current_end = e.end + else: + if e.start > current_end: + # This is a non-overlapping exon + # We store the previous exon: + exons_starts.append(current_start) + exons_length.append(current_end - current_start) + # We set the current: + current_start = e.start - 1 + current_end = e.end + else: + # This is an overlapping exon + # We update current_end if necessary + current_end = max(current_end, e.end) + if current_start != -1: + # There is a last exon to store: + exons_starts.append(current_start) + exons_length.append(current_end - current_start) + else: + exons_starts = [e.start - 1 + for e in + db.children(tr, featuretype='exon', + order_by='start')] + exons_length = [len(e) + for e in + db.children(tr, featuretype='exon', + order_by='start')] + # Rewrite the chromosome name if needed: + chrom = tr.chrom + if ucsc and chrom[0:3] != 'chr': + chrom = 'chr' + chrom + fo.write("%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\n" % + (chrom, tr.start - 1, tr.end, trName, 0, tr.strand, + cds_start, cds_end, "0", len(exons_starts), + ",".join([str(ex_l) for ex_l in exons_length]), + ",".join([str(s - (tr.start - 1)) for s in exons_starts]))) + + +argp = argparse.ArgumentParser( + description=("Convert a gtf to a bed12 with one entry" + " per transcript/gene")) +argp.add_argument('input', default=None, + help="Input gtf file (can be gzip).") +argp.add_argument('--output', default=sys.stdout, + type=argparse.FileType('w'), + help="Output bed12 file.") +argp.add_argument('--useGene', action="store_true", + help="Use the gene name instead of the " + "transcript name.") +argp.add_argument('--ucscformat', action="store_true", + help="If you want that all chromosome names " + "begin with 'chr'.") +group = argp.add_mutually_exclusive_group() +group.add_argument('--mergeTranscripts', action="store_true", + help="Merge all transcripts into a single " + "entry to have one line per gene.") +group.add_argument('--mergeTranscriptsAndOverlappingExons', + action="store_true", + help="Merge all transcripts into a single " + "entry to have one line per gene and merge" + " overlapping exons.") + +args = argp.parse_args() +convert_gtf_to_bed(args.input, args.output, args.useGene, + args.mergeTranscripts, + args.mergeTranscriptsAndOverlappingExons, + args.ucscformat)