comparison fml_gff_groomer/scripts/gff_available_limits.py @ 0:79726c328621 default tip

Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author vipints
date Tue, 07 Jun 2011 17:29:24 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:79726c328621
1 #!/usr/bin/env python
2 #
3 # This program is free software; you can redistribute it and/or modify
4 # it under the terms of the GNU General Public License as published by
5 # the Free Software Foundation; either version 3 of the License, or
6 # (at your option) any later version.
7 #
8 # Written (W) 2010 Vipin T Sreedharan, Friedrich Miescher Laboratory of the Max Planck Society
9 # Copyright (C) 2010 Max Planck Society
10 #
11 # Description : Provide available source, feature types from a GFF file
12
13 import re, sys
14 import time
15 import collections
16
def available_limits(gff_handle):
    """Count the sequence IDs, sources and feature types present in a GFF file.

    Arguments:
        gff_handle: any iterable yielding the lines of a GFF file as strings
            (for example an open file object).

    Lines whose first character is "#" are treated as comments/directives and
    skipped, as are blank lines.  A line without any tab that still contains
    word characters is taken to be part of a FASTA section embedded in the
    GFF file and is skipped as well.

    Returns:
        A dict with the keys 'gff_id', 'gff_source', 'gff_type' and
        'gff_source_type'.  Each value is a plain dict mapping a tuple of
        column values (column 1, column 2, column 3, and columns 2+3
        respectively) to the number of feature lines carrying those values.

    Raises:
        AssertionError: a feature line does not consist of exactly nine
            tab-separated columns; the offending line is the exception
            argument.
    """
    # 0-based column indexes that make up the counting key of each summary.
    filter_info = dict(gff_id=[0], gff_source_type=[1, 2],
                       gff_source=[1], gff_type=[2])
    cur_limits = dict((filter_key, collections.defaultdict(int))
                      for filter_key in filter_info)

    for line in gff_handle:
        record = line.strip('\n\r')
        # Blank lines carry no feature; indexing record[0] on them used to
        # raise IndexError.
        if not record.strip():
            continue
        if record[0] == "#":
            continue
        parts = [p.strip() for p in line.split('\t')]
        # GFF files with FASTA sequence together
        if len(parts) == 1 and re.search(r'\w+', parts[0]):
            continue
        # Raised explicitly (rather than via `assert`) so the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        if len(parts) != 9:
            raise AssertionError(line)
        for filter_key, cur_indexes in filter_info.items():
            cur_id = tuple([parts[i] for i in cur_indexes])
            cur_limits[filter_key][cur_id] += 1

    # get rid of the default dicts
    return dict((key, dict(value_dict))
                for key, value_dict in cur_limits.items())
41
if __name__ == '__main__':
    # Command-line entry point: print a summary of the sequence IDs, sources
    # and feature types found in the GFF file named by the first argument.

    print('-------------------------------------------------------')
    print('FeatureScan started on ' + time.asctime())
    print('-------------------------------------------------------')

    # Universal-newline mode must be requested with 'U' on Python 2, but that
    # flag is rejected by Python 3.11+, where text mode already translates
    # newlines.
    read_mode = 'rU' if sys.version_info[0] < 3 else 'r'
    try:
        gff_handle = open(sys.argv[1], read_mode)
    except (IndexError, IOError):
        # IndexError: no file argument given; IOError: file cannot be opened.
        sys.stderr.write("Can't open the GFF3 file, terminating...\n")
        sys.stderr.write("USAGE: gff_available_limits.py <gff file>\n")
        sys.exit(-1)

    # Close the handle even when parsing fails on a malformed line.
    try:
        final_dict = available_limits(gff_handle)
    finally:
        gff_handle.close()

    # The keys of every summary dict are tuples of column values, hence the
    # [0] / [1] indexing below.
    print('')
    print("==Overview of available source(s) and feature type(s) from GFF file==")
    print('')
    print("Chromosome identifier(s) and corresponding count:")
    for contig, cnt in sorted(final_dict['gff_id'].items()):
        print('\t' + str(contig[0]) + '\t' + str(cnt))
    print('')
    print("Source(s) of feature and corresponding count:")
    for source, cnt in sorted(final_dict['gff_source'].items()):
        print('\t' + str(source[0]) + '\t' + str(cnt))
    print('')
    print("Feature type(s) and corresponding count:")
    for ftype, cnt in sorted(final_dict['gff_type'].items()):
        print('\t' + str(cnt) + '\t' + str(ftype[0]))
    print('')
    print("Unique combination of Feature type(s), Source(s) and corresponding count:")
    for sftype, cnt in sorted(final_dict['gff_source_type'].items()):
        print('\t' + str(cnt) + '\t' + str(sftype[0]) + ', ' + str(sftype[1]))
    print('')

    print('-------------------------------------------------------')
    print('FeatureScan finished at ' + time.asctime())
    print('-------------------------------------------------------')