Mercurial > repos > artbio > rsem
comparison purge_gtf_from_multichrom_genes.py @ 6:45a30e216fec draft
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
author | artbio |
---|---|
date | Sun, 03 Jun 2018 19:50:57 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:835a2f3372b5 | 6:45a30e216fec |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import argparse | |
4 from collections import defaultdict | |
5 | |
6 | |
7 def command_parse(): | |
8 parser = argparse.ArgumentParser(description='Purge GTF file from genes \ | |
9 that are on several chromosomes and list them in a log file') | |
10 parser.add_argument( | |
11 '-i', '--input', dest='input', help='input GTF file', required=True) | |
12 parser.add_argument('-o', '--output', dest='output', help='output file \ | |
13 name', default='output.gtf') | |
14 parser.add_argument('-l', '--log', dest='log', help='log of purged \ | |
15 genes', default='purged_genes.log') | |
16 args = parser.parse_args() | |
17 return args | |
18 | |
19 | |
20 def get_genes(gtf_file): | |
21 genes = defaultdict(list) | |
22 with open(gtf_file, 'r') as fh: | |
23 for line in fh: | |
24 if line[0] != '#': | |
25 fields = line[:-1].split("\t") | |
26 chrom = fields[0] | |
27 name_gene = fields[-1].split('gene_id "')[-1].split('"; \ | |
28 transcript_id')[0] | |
29 genes[name_gene].append(chrom) | |
30 return genes | |
31 | |
32 | |
33 def generate_output(genes, log_file): | |
34 ''' | |
35 Search for all genes that are present on several chromosomes. This function | |
36 return a list of these genes in target_genes. It also generate a log tab | |
37 delimited file with one gene per line and with its list of chromosomes | |
38 (coma delimited) | |
39 ''' | |
40 output = open(log_file, 'w') | |
41 # output.write('#all genes on several chromosomes' + '\n') | |
42 target_genes = list() | |
43 for name_gene in genes.keys(): | |
44 genes[name_gene] = set(genes[name_gene]) | |
45 if len(genes[name_gene]) > 1: | |
46 target_genes.append(name_gene) | |
47 new_line = '\t'.join([name_gene, ','.join(genes[name_gene])]) | |
48 output.write("%s\n" % new_line) | |
49 output.close() | |
50 return target_genes | |
51 | |
52 | |
53 def purge_gtf(target_genes, gtf_file, output_file): | |
54 ''' | |
55 Remove all lines of the gtf file where the gene_id is gene of target_genes | |
56 list. | |
57 ''' | |
58 output_gtf = open(output_file, 'w') | |
59 with open(gtf_file, 'r') as gtf_handler: | |
60 for line in gtf_handler: | |
61 fields = line[:-1].split("\t") | |
62 gene_name = fields[-1].split('gene_id "')[-1].split('"; \ | |
63 transcript_id')[0] | |
64 if gene_name not in target_genes: | |
65 output_gtf.write(line) | |
66 output_gtf.close() | |
67 | |
68 | |
69 def __main__(): | |
70 args = command_parse() | |
71 genes = get_genes(args.input) | |
72 target_genes = generate_output(genes, args.log) | |
73 purge_gtf(target_genes, args.input, args.output) | |
74 | |
75 | |
76 if __name__ == "__main__": | |
77 __main__() |