diff fml_gff_groomer/scripts/gff_available_limits.py @ 0:79726c328621 default tip

Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author vipints
date Tue, 07 Jun 2011 17:29:24 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fml_gff_groomer/scripts/gff_available_limits.py	Tue Jun 07 17:29:24 2011 -0400
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2010 Vipin T Sreedharan, Friedrich Miescher Laboratory of the Max Planck Society
+# Copyright (C) 2010 Max Planck Society
+#
+# Description : Provide available source, feature types from a GFF file 
+
+import re, sys
+import time 
+import collections
+
+def available_limits(gff_handle):
+    """Figure out the available feature types from the given GFF file"""
+
+    filter_info = dict(gff_id = [0], gff_source_type = [1, 2],
+                gff_source = [1], gff_type = [2])
+    cur_limits = dict()
+    for filter_key in filter_info.keys():
+        cur_limits[filter_key] = collections.defaultdict(int)
+    for line in gff_handle:
+        if line.strip('\n\r')[0] != "#":
+            parts = [p.strip() for p in line.split('\t')]
+            if len(parts) == 1 and re.search(r'\w+', parts[0]):continue ## GFF files with FASTA sequence together 
+            assert len(parts) == 9, line
+            for filter_key, cur_indexes in filter_info.items():
+                cur_id = tuple([parts[i] for i in cur_indexes])
+                cur_limits[filter_key][cur_id] += 1
+    # get rid of the default dicts
+    final_dict = dict()
+    for key, value_dict in cur_limits.items():
+        if len(key) == 1:
+            key = key[0]
+        final_dict[key] = dict(value_dict)
+   
+    return final_dict
+
+if __name__=='__main__':
+    
+    stime = time.asctime( time.localtime(time.time()) )
+    print '-------------------------------------------------------'
+    print 'FeatureScan started on ' + stime
+    print '-------------------------------------------------------'
+
+    try:
+        gff_handle = open(sys.argv[1], 'rU')
+    except:
+        sys.stderr.write("Can't open the GFF3 file, terminating...\n")
+        sys.stderr.write("USAGE: gff_available_limits.py <gff file>\n")
+        sys.exit(-1)
+    final_dict = available_limits(gff_handle)
+    gff_handle.close()
+    print 
+    print "==Overview of available source(s) and feature type(s) from GFF file=="
+    print 
+    print "Chromosome identifier(s) and corresponding count:"
+    for contig, cnt in sorted(final_dict['gff_id'].items()):
+        print '\t' + str(contig[0]) + '\t' + str(cnt)
+    print
+    print "Source(s) of feature and corresponding count:"
+    for source, cnt in sorted(final_dict['gff_source'].items()):
+        print '\t' + str(source[0]) + '\t' + str(cnt)
+    print
+    print "Feature type(s) and corresponding count:"
+    for ftype, cnt in sorted(final_dict['gff_type'].items()):
+        print '\t' + str(cnt) + '\t' + str(ftype[0]) 
+    print
+    print "Unique combination of Feature type(s), Source(s) and corresponding count:"
+    for sftype, cnt in sorted(final_dict['gff_source_type'].items()):
+        print '\t' + str(cnt) + '\t' + str(sftype[0]) + ', '+ str(sftype[1])
+    print
+    stime = time.asctime( time.localtime(time.time()) )
+    print '-------------------------------------------------------'
+    print 'FeatureScan finished at ' + stime
+    print '-------------------------------------------------------'