Mercurial > repos > iuc > infercnv
diff gtf_to_position_file.py @ 0:be7c0c692879 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/infercnv commit 7e7abdef47fdf68f3ca69b75a8477dabc7bfa965
author | iuc |
---|---|
date | Tue, 23 Jul 2024 15:43:10 +0000 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python


"""
Converts GTF files to proprietary formats.

The only conversion implemented here turns a GTF annotation into a
tab-separated "positional" file with one line per feature name:

    <name> <TAB> <chromosome> <TAB> <min start> <TAB> <max end>
"""


# Import statements
import argparse
import csv
import gzip
import os
import sys


__author__ = 'Timothy Tickle, Itay Tirosh, Brian Haas'
__copyright__ = 'Copyright 2016'
__credits__ = ["Timothy Tickle"]
__license__ = 'BSD-3'
__maintainer__ = 'Timothy Tickle'
__email__ = 'ttickle@broadinstitute.org'
__status__ = 'Development'


# A GTF record has 9 tab-separated columns; the attributes are the last one.
_GTF_COLUMN_COUNT = 9


def open_file(file_path):
    """ Open a file, handling gzip if necessary.

    :param file_path: Path to input file
    :type file_path: String

    :returns: File object opened for reading in text mode
    """
    if file_path.endswith('.gz'):
        return gzip.open(file_path, 'rt')
    return open(file_path, 'r')


def _parse_attributes(attribute_field):
    """ Parse the GTF attribute column into a dictionary.

    Entries are separated by ';' and each entry is a key followed by a
    (usually double-quoted) value. Only the part of the key before the first
    '|' is kept. When a key occurs more than once the last value wins.

    Note: a ';' inside a quoted value is still treated as a separator.

    :param attribute_field: Raw text of the attribute column
    :type attribute_field: String

    :returns: Mapping of attribute key to unquoted value
    :rtype: dict
    """
    attributes = {}
    for entry in attribute_field.split(";"):
        entry = entry.strip()
        if not entry:
            continue
        # Split on the first space only so quoted values that contain
        # spaces are kept whole; a key without a value maps to "".
        key, _, value = entry.partition(" ")
        attributes[key.strip('"').split("|")[0]] = value.strip().strip('"')
    return attributes


def _record_gene(gene, chromosome, positions, genes_seen, output_lines):
    """ Append the positional line for a finished group of GTF records.

    Nothing is recorded when no coordinates were collected or when the gene
    has already been written (non-contiguous repeats of a gene are ignored).

    :returns: True if a line was appended, False otherwise
    :rtype: boolean
    """
    if len(positions) < 2 or gene in genes_seen:
        return False
    output_lines.append("\t".join([gene,
                                   chromosome,
                                   str(min(positions)),
                                   str(max(positions))]))
    genes_seen.add(gene)
    return True


def convert_to_positional_file(input_gtf, output_positional, attribute_key):
    """ Convert input GTF file to positional file.

    Consecutive GTF records sharing the same value for ``attribute_key`` are
    merged into one output line spanning their smallest start and largest
    end coordinate. The chromosome reported is the one of the last record in
    the group. Parsing metrics are printed to standard output.

    :param input_gtf: Path to input gtf file (plain text or ``.gz``)
    :type input_gtf: String
    :param output_positional: Path to output positional file
    :type output_positional: String
    :param attribute_key: Key of the GTF attribute to use for feature/row names
    :type attribute_key: String

    :returns: Indicator of success (True) or Failure (False)
    :rtype: boolean
    :raises SystemExit: with code 99 when a record has fewer than 9 columns
                        or lacks the requested attribute
    """

    if not input_gtf or not os.path.exists(input_gtf):
        print("".join(["gtf_to_position_file.py:: ",
                       "Could not find input file : " + str(input_gtf)]))
        return False

    all_genes_found = set()

    # Holds lines to output after parsing.
    output_line = []
    previous_gene = None
    previous_chr = None
    gene_positions = []

    # Metrics for the file
    i_comments = 0
    i_duplicate_entries = 0
    i_entries = 0
    i_accepted_entries = 0

    with open_file(input_gtf) as gtf:
        # GTF is plain tab-separated text, not CSV-quoted: disable quote
        # handling so the double quotes of the attribute column stay literal.
        gtf_file = csv.reader(gtf, delimiter="\t", quoting=csv.QUOTE_NONE)
        for gtf_line in gtf_file:
            # Blank lines come back as an empty list (or blank fields).
            if not any(field.strip() for field in gtf_line):
                continue
            if gtf_line[0].startswith("#"):
                i_comments += 1
                continue
            i_entries += 1
            if len(gtf_line) < _GTF_COLUMN_COUNT:
                print("Expected " + str(_GTF_COLUMN_COUNT) + " tab-separated columns in the GTF but found " + str(len(gtf_line)) + ". Line=" + "\t".join(gtf_line))
                sys.exit(99)
            # Clean up the attribute keys and match the one of interest.
            attributes = _parse_attributes(gtf_line[8])
            if attribute_key not in attributes:
                print("Could not find an attribute in the GTF with the name '" + attribute_key + "'. Line=" + "\t".join(gtf_line))
                sys.exit(99)
            gene_name = attributes[attribute_key]
            if gene_name != previous_gene:
                # A new group starts: emit the finished one and always start
                # from an empty coordinate list, even if it was skipped.
                if _record_gene(previous_gene, previous_chr, gene_positions,
                                all_genes_found, output_line):
                    i_accepted_entries += 1
                gene_positions = []
            else:
                i_duplicate_entries += 1
            gene_positions += [int(gtf_line[3]), int(gtf_line[4])]
            previous_gene = gene_name
            previous_chr = gtf_line[0]
        # Emit the last group, applying the same duplicate check as above.
        if _record_gene(previous_gene, previous_chr, gene_positions,
                        all_genes_found, output_line):
            i_accepted_entries += 1

    with open(output_positional, "w") as positional_file:
        positional_file.write("\n".join(output_line))
    i_written_lines = len(output_line)

    # Print metrics
    print("Number of lines read: " + str(i_entries))
    print("Number of comments: " + str(i_comments))
    print("Number of entries: " + str(i_accepted_entries))
    print("Number of duplicate entries: " + str(i_duplicate_entries))
    print("Number of entries written: " + str(i_written_lines))
    return True


if __name__ == "__main__":

    # Parse arguments
    prsr_arguments = argparse.ArgumentParser(prog='gtf_to_position_file.py',
                                             description='Convert a GTF file to a positional file.',
                                             formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Add positional argument
    prsr_arguments.add_argument("input_gtf",
                                metavar="input_gtf",
                                help="Path to the input GTF file.")
    prsr_arguments.add_argument("--attribute_name",
                                metavar="attribute_name",
                                default="gene_id",
                                help="The name of the attribute in the GTF attributes to use instead of gene name, for example 'gene_name' or 'transcript_id'.")
    prsr_arguments.add_argument("output_positional",
                                metavar="output_positional",
                                help="Path for the output positional file.")
    args = prsr_arguments.parse_args()

    # Run Script; report failure to the caller through the exit status.
    if not convert_to_positional_file(args.input_gtf, args.output_positional, args.attribute_name):
        sys.exit(1)