comparison purge_gtf_from_multichrom_genes.py @ 6:45a30e216fec draft

planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
author artbio
date Sun, 03 Jun 2018 19:50:57 -0400
parents
children
comparison
equal deleted inserted replaced
5:835a2f3372b5 6:45a30e216fec
1 #!/usr/bin/env python
2
3 import argparse
4 from collections import defaultdict
5
6
7 def command_parse():
8 parser = argparse.ArgumentParser(description='Purge GTF file from genes \
9 that are on several chromosomes and list them in a log file')
10 parser.add_argument(
11 '-i', '--input', dest='input', help='input GTF file', required=True)
12 parser.add_argument('-o', '--output', dest='output', help='output file \
13 name', default='output.gtf')
14 parser.add_argument('-l', '--log', dest='log', help='log of purged \
15 genes', default='purged_genes.log')
16 args = parser.parse_args()
17 return args
18
19
20 def get_genes(gtf_file):
21 genes = defaultdict(list)
22 with open(gtf_file, 'r') as fh:
23 for line in fh:
24 if line[0] != '#':
25 fields = line[:-1].split("\t")
26 chrom = fields[0]
27 name_gene = fields[-1].split('gene_id "')[-1].split('"; \
28 transcript_id')[0]
29 genes[name_gene].append(chrom)
30 return genes
31
32
33 def generate_output(genes, log_file):
34 '''
35 Search for all genes that are present on several chromosomes. This function
36 return a list of these genes in target_genes. It also generate a log tab
37 delimited file with one gene per line and with its list of chromosomes
38 (coma delimited)
39 '''
40 output = open(log_file, 'w')
41 # output.write('#all genes on several chromosomes' + '\n')
42 target_genes = list()
43 for name_gene in genes.keys():
44 genes[name_gene] = set(genes[name_gene])
45 if len(genes[name_gene]) > 1:
46 target_genes.append(name_gene)
47 new_line = '\t'.join([name_gene, ','.join(genes[name_gene])])
48 output.write("%s\n" % new_line)
49 output.close()
50 return target_genes
51
52
53 def purge_gtf(target_genes, gtf_file, output_file):
54 '''
55 Remove all lines of the gtf file where the gene_id is gene of target_genes
56 list.
57 '''
58 output_gtf = open(output_file, 'w')
59 with open(gtf_file, 'r') as gtf_handler:
60 for line in gtf_handler:
61 fields = line[:-1].split("\t")
62 gene_name = fields[-1].split('gene_id "')[-1].split('"; \
63 transcript_id')[0]
64 if gene_name not in target_genes:
65 output_gtf.write(line)
66 output_gtf.close()
67
68
69 def __main__():
70 args = command_parse()
71 genes = get_genes(args.input)
72 target_genes = generate_output(genes, args.log)
73 purge_gtf(target_genes, args.input, args.output)
74
75
76 if __name__ == "__main__":
77 __main__()