diff purge_gtf_from_multichrom_genes.py @ 6:45a30e216fec draft

planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
author artbio
date Sun, 03 Jun 2018 19:50:57 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/purge_gtf_from_multichrom_genes.py	Sun Jun 03 19:50:57 2018 -0400
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+
+import argparse
+from collections import defaultdict
+
+
+def command_parse():
+    parser = argparse.ArgumentParser(description='Purge GTF file from genes \
+        that are on several chromosomes and list them in a log file')
+    parser.add_argument(
+        '-i', '--input', dest='input', help='input GTF file', required=True)
+    parser.add_argument('-o', '--output', dest='output', help='output file \
+        name', default='output.gtf')
+    parser.add_argument('-l', '--log', dest='log', help='log of purged \
+        genes', default='purged_genes.log')
+    args = parser.parse_args()
+    return args
+
+
+def get_genes(gtf_file):
+    genes = defaultdict(list)
+    with open(gtf_file, 'r') as fh:
+        for line in fh:
+            if line[0] != '#':
+                fields = line[:-1].split("\t")
+                chrom = fields[0]
+                name_gene = fields[-1].split('gene_id "')[-1].split('"; \
+                    transcript_id')[0]
+                genes[name_gene].append(chrom)
+    return genes
+
+
+def generate_output(genes, log_file):
+    '''
+    Search for all genes that are present on several chromosomes. This function
+    return a list of these genes in target_genes. It also generate a log tab
+    delimited file with one gene per line and with its list of chromosomes
+    (coma delimited)
+    '''
+    output = open(log_file, 'w')
+    # output.write('#all genes on several chromosomes' + '\n')
+    target_genes = list()
+    for name_gene in genes.keys():
+        genes[name_gene] = set(genes[name_gene])
+        if len(genes[name_gene]) > 1:
+            target_genes.append(name_gene)
+            new_line = '\t'.join([name_gene, ','.join(genes[name_gene])])
+            output.write("%s\n" % new_line)
+    output.close()
+    return target_genes
+
+
+def purge_gtf(target_genes, gtf_file, output_file):
+    '''
+    Remove all lines of the gtf file where the gene_id is gene of target_genes
+    list.
+    '''
+    output_gtf = open(output_file, 'w')
+    with open(gtf_file, 'r') as gtf_handler:
+        for line in gtf_handler:
+            fields = line[:-1].split("\t")
+            gene_name = fields[-1].split('gene_id "')[-1].split('"; \
+                transcript_id')[0]
+            if gene_name not in target_genes:
+                output_gtf.write(line)
+    output_gtf.close()
+
+
+def __main__():
+    args = command_parse()
+    genes = get_genes(args.input)
+    target_genes = generate_output(genes, args.log)
+    purge_gtf(target_genes, args.input, args.output)
+
+
+if __name__ == "__main__":
+    __main__()