Mercurial > repos > artbio > rsem
annotate purge_gtf_from_multichrom_genes.py @ 9:e00a79cf5f8c draft default tip
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit fdb6ccd340e366ef386d69ff344036a83e479b2f"
author | artbio |
---|---|
date | Tue, 03 Mar 2020 07:28:39 -0500 |
parents | 45a30e216fec |
children |
rev | line source |
---|---|
6
45a30e216fec
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
artbio
parents:
diff
changeset
|
1 #!/usr/bin/env python |
45a30e216fec
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
artbio
parents:
diff
changeset
|
2 |
45a30e216fec
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
artbio
parents:
diff
changeset
|
3 import argparse |
45a30e216fec
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
artbio
parents:
diff
changeset
|
4 from collections import defaultdict |
45a30e216fec
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
artbio
parents:
diff
changeset
|
5 |
45a30e216fec
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
artbio
parents:
diff
changeset
|
6 |
45a30e216fec
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
artbio
parents:
diff
changeset
|
def command_parse(argv=None):
    '''
    Build the command line parser and parse the script options.

    argv: optional list of argument strings. When None (the default),
    argparse falls back to sys.argv[1:], which is the historical behaviour.

    Return the argparse namespace with three attributes:
    input (required), output (default 'output.gtf') and
    log (default 'purged_genes.log').
    '''
    # Adjacent literals are concatenated by the parser, so the help texts do
    # not pick up source indentation the way a backslash continuation inside
    # a string literal does.
    parser = argparse.ArgumentParser(
        description='Purge GTF file from genes that are on several '
                    'chromosomes and list them in a log file')
    parser.add_argument(
        '-i', '--input', dest='input', help='input GTF file', required=True)
    parser.add_argument(
        '-o', '--output', dest='output', help='output file name',
        default='output.gtf')
    parser.add_argument(
        '-l', '--log', dest='log', help='log of purged genes',
        default='purged_genes.log')
    return parser.parse_args(argv)
45a30e216fec
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
artbio
parents:
diff
changeset
|
18 |
45a30e216fec
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
artbio
parents:
diff
changeset
|
19 |
45a30e216fec
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
artbio
parents:
diff
changeset
|
def get_genes(gtf_file):
    '''
    Read a GTF file and record, for every gene, the chromosome of each of
    its lines.

    gtf_file: path of the tab-delimited GTF file to read.

    Return a defaultdict(list) mapping a gene key to the list of first-column
    values (chromosomes) of the lines carrying that key. There is one entry
    per line, so a chromosome is usually repeated many times.

    Comment lines (starting with '#') and blank lines are ignored.

    NOTE(review): the gene key is the text found after the last 'gene_id "'
    of the attribute column, cut at '"; transcript_id'. When transcript_id
    does not directly follow gene_id, the key keeps the trailing attributes.
    purge_gtf derives its key the same way, so both must change together.
    '''
    genes = defaultdict(list)
    with open(gtf_file, 'r') as fh:
        for line in fh:
            # Strip only the line terminator: slicing off the last character
            # would eat real data when the file has no trailing newline.
            record = line.rstrip('\n')
            # Blank lines would otherwise register a bogus '' gene.
            if not record.strip() or record.startswith('#'):
                continue
            fields = record.split('\t')
            chrom = fields[0]
            name_gene = fields[-1].split('gene_id "')[-1].split(
                '"; transcript_id')[0]
            genes[name_gene].append(chrom)
    return genes
45a30e216fec
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
artbio
parents:
diff
changeset
|
31 |
45a30e216fec
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
artbio
parents:
diff
changeset
|
32 |
45a30e216fec
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
artbio
parents:
diff
changeset
|
def generate_output(genes, log_file):
    '''
    Find the genes located on several chromosomes and log them.

    genes: mapping of gene name -> iterable of chromosome names (duplicates
    allowed). The mapping is only read, never modified.
    log_file: path of the log file to (over)write. It is tab delimited, with
    one line per reported gene: the gene name, then its chromosomes as a
    comma-separated list.

    Return the list of gene names found on more than one chromosome, in the
    iteration order of genes.
    '''
    target_genes = []
    # The context manager closes the log even if a write fails.
    with open(log_file, 'w') as output:
        for name_gene, chroms in genes.items():
            # Sorting the distinct chromosomes keeps the log reproducible;
            # joining a bare set gives an order that varies between runs.
            unique_chroms = sorted(set(chroms))
            if len(unique_chroms) > 1:
                target_genes.append(name_gene)
                new_line = '\t'.join([name_gene, ','.join(unique_chroms)])
                output.write("%s\n" % new_line)
    return target_genes
45a30e216fec
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
artbio
parents:
diff
changeset
|
51 |
45a30e216fec
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
artbio
parents:
diff
changeset
|
52 |
45a30e216fec
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
artbio
parents:
diff
changeset
|
def purge_gtf(target_genes, gtf_file, output_file):
    '''
    Copy a GTF file, leaving out every line that belongs to a target gene.

    target_genes: iterable of gene keys to remove, as returned by
    generate_output.
    gtf_file: path of the GTF file to read.
    output_file: path of the purged GTF file to (over)write.

    A line is dropped when either of these is in target_genes:
      - the key built like get_genes does (text after the last 'gene_id "',
        cut at '"; transcript_id');
      - the plain gene_id value, so that lines of a purged gene whose
        gene_id is not directly followed by transcript_id (e.g. 'gene'
        feature lines) are removed as well.
    All other lines, comments included, are written unchanged.
    '''
    # Build the lookup set once: membership is tested for every GTF line.
    targets = set(target_genes)
    with open(gtf_file, 'r') as gtf_handler, \
            open(output_file, 'w') as output_gtf:
        for line in gtf_handler:
            attributes = line.rstrip('\n').split('\t')[-1]
            # Same derivation as get_genes, kept for compatibility with the
            # keys it produces.
            legacy_key = attributes.split('gene_id "')[-1].split(
                '"; transcript_id')[0]
            # Actual gene_id value: text between 'gene_id "' and the next
            # double quote; None when the line has no gene_id attribute.
            _, found, rest = attributes.partition('gene_id "')
            gene_id = rest.partition('"')[0] if found else None
            if legacy_key in targets or gene_id in targets:
                continue
            output_gtf.write(line)
45a30e216fec
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
artbio
parents:
diff
changeset
|
67 |
45a30e216fec
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
artbio
parents:
diff
changeset
|
68 |
45a30e216fec
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 4bc762d0932b87d4e91ce76bc3eeb52f0b8d3bc6
artbio
parents:
diff
changeset
|
def __main__():
    '''
    Script entry point: parse the options, find the genes present on
    several chromosomes (logging them), then write the purged GTF file.
    '''
    options = command_parse()
    multichrom_genes = generate_output(get_genes(options.input), options.log)
    purge_gtf(multichrom_genes, options.input, options.output)


# Run the workflow only when executed as a script, not on import.
if __name__ == "__main__":
    __main__()