# HG changeset patch # User brigidar # Date 1444426250 14400 # Node ID 82186605ba751f71532f77ccac7674d8b3268054 # Parent 0c769b205d1837d2be4a4ffce167b84674ba361b Uploaded diff -r 0c769b205d18 -r 82186605ba75 select_product.py --- a/select_product.py Fri Oct 09 11:52:37 2015 -0400 +++ b/select_product.py Fri Oct 09 17:30:50 2015 -0400 @@ -38,33 +38,6 @@ return FH -def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, product, keys_1, n): - """ - Write the feature information - """ - - lines = [] - - for gname, ginfo in genes.items(): - line = [str(chr_id), - ginfo[3], - str(ginfo[0]-n), - str(ginfo[1]+n), - ginfo[2], - str(product)] - -# to key on specfic products only - for k in keys_1.split(","): - if k in str(product): - - lines.append('\t'.join(line)+"\n") - break - - return lines - - - - def gbk_parse(fname,keys_1,n,outfile=""): """ Extract genome annotation recods from genbank format @@ -77,9 +50,7 @@ for record in SeqIO.parse(fhand, "genbank"): gene_tags = dict() - tx_tags = collections.defaultdict(list) - exon = collections.defaultdict(list) - cds = collections.defaultdict(list) + mol_type, chr_id = None, None for rec in record.features: @@ -90,7 +61,7 @@ if rec.qualifiers.has_key('product'): product = rec.qualifiers['product'][0] - + if rec.type == 'source': try: @@ -103,49 +74,25 @@ except: chr_id = record.name continue + - strand='-' - strand='+' if rec.strand>0 else strand - - fid = None - try: - fid = rec.qualifiers['gene'][0] - except: - pass - - transcript_id = None - try: - transcript_id = rec.qualifiers['transcript_id'][0] - except: - pass + start = int(rec.location.start)+1-n + stop = int(rec.location.end)+n - if re.search(r'gene', rec.type): - gene_tags[fid] = (rec.location._start.position+1, - rec.location._end.position, - strand, - rec.type - ) - elif rec.type == 'exon': - exon[fid].append((rec.location._start.position+1, - rec.location._end.position)) - elif rec.type=='CDS': - cds[fid].append((rec.location._start.position+1, - rec.location._end.position)) - else: - # get all transcripts - if transcript_id: - tx_tags[fid].append((rec.location._start.position+1, - rec.location._end.position, - transcript_id, - rec.type)) - # record extracted, generate feature table - lines += feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, product,keys_1,n) - + if start < 0: + start = 1 + + for k in keys_1.split(","): + + if str(k) in str(product): + lines.append('\t'.join([str(chr_id),str(start), str(stop), product])+"\n") + + fhand.close() return lines -if __name__=='__main__': +if __name__=='__main__': try: parser = argparse.ArgumentParser()