Mercurial > repos > brigidar > select_product

--- a/select_product.py	Fri Oct 09 11:52:37 2015 -0400
+++ b/select_product.py	Fri Oct 09 17:30:50 2015 -0400
@@ -38,33 +38,6 @@
     return FH


-def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, product, keys_1, n):
-    """
-    Write the feature information
-    """
-
-    lines = []
-
-    for gname, ginfo in genes.items():
-        line = [str(chr_id),
-                ginfo[3],
-                str(ginfo[0]-n),
-                str(ginfo[1]+n),
-                ginfo[2],
-                str(product)]
-
-# to key on specfic products only
-        for k in keys_1.split(","):
-            if k in str(product):
-
-                lines.append('\t'.join(line)+"\n")
-                break
-
-    return lines
-
-
-
-
 def gbk_parse(fname,keys_1,n,outfile=""):
     """
     Extract genome annotation recods from genbank format
@@ -77,9 +50,7 @@

     for record in SeqIO.parse(fhand, "genbank"):
         gene_tags = dict()
-        tx_tags = collections.defaultdict(list)
-        exon = collections.defaultdict(list)
-        cds = collections.defaultdict(list)
+
         mol_type, chr_id = None, None

         for rec in record.features:
@@ -90,7 +61,7 @@
             if rec.qualifiers.has_key('product'):

                 product = rec.qualifiers['product'][0]
-
+

             if rec.type == 'source':
                 try:
@@ -103,49 +74,25 @@
                 except:
                     chr_id = record.name
                 continue
+

-            strand='-'
-            strand='+' if rec.strand>0 else strand
-
-            fid = None
-            try:
-                fid = rec.qualifiers['gene'][0]
-            except:
-                pass
-
-            transcript_id = None
-            try:
-                transcript_id = rec.qualifiers['transcript_id'][0]
-            except:
-                pass
+            start = int(rec.location.start)+1-n
+            stop = int(rec.location.end)+n

-            if re.search(r'gene', rec.type):
-                gene_tags[fid] = (rec.location._start.position+1,
-                                    rec.location._end.position,
-                                    strand,
-                                    rec.type
-                                    )
-            elif rec.type == 'exon':
-                exon[fid].append((rec.location._start.position+1,
-                                    rec.location._end.position))
-            elif rec.type=='CDS':
-                cds[fid].append((rec.location._start.position+1,
-                                    rec.location._end.position))
-            else:
-                # get all transcripts
-                if transcript_id:
-                    tx_tags[fid].append((rec.location._start.position+1,
-                                    rec.location._end.position,
-                                    transcript_id,
-                                    rec.type))
-        # record extracted, generate feature table
-        lines += feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, product,keys_1,n)
-
+            if start < 0:
+                start = 1
+
+            for k in keys_1.split(","):
+
+                if str(k) in str(product):
+                    lines.append('\t'.join([str(chr_id),str(start), str(stop), product])+"\n")
+
+
     fhand.close()

     return lines

-if __name__=='__main__':
+if __name__=='__main__':

     try:
         parser = argparse.ArgumentParser()