gstf_preparation: gstf_preparation.py comparison

comparison gstf_preparation.py @ 10:e8e75a79de59 draft

"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9c8611fee927883f50bc6955771aa69df1ce8457"

author	earlhaminst
date	Thu, 31 Oct 2019 08:16:51 -0400
parents	f4acbfe8d6fe
children	dbe37a658cd2

comparison

equal deleted inserted replaced

-:f4acbfe8d6fe
+:e8e75a79de59
 from __future__ import print_function
 import json
 import optparse
+import os
 import sqlite3
 import sys
 version = "0.4.0"
 gene_count = 0
 if parent_dict is not None and 'Parent' in d:
 # a 3' UTR can be split among multiple exons
 # a 5' UTR can be split among multiple exons
 # a CDS can be part of multiple transcripts
 for parent in d['Parent'].split(','):
-if parent not in parent_dict:
+parent_dict.setdefault(parent, []).append(d)
-parent_dict[parent] = [d]
-else:
-parent_dict[parent].append(d)
 return d
 def add_gene_to_dict(cols, species, gene_dict):
 global gene_count
 gene_count = gene_count + 1
 def add_transcript_to_dict(cols, species, transcript_dict):
 transcript = feature_to_dict(cols)
+if 'biotype' in transcript and transcript['biotype'] != 'protein_coding':
+return
 transcript.update({
 'object_type': 'Transcript',
 'seq_region_name': cols[0],
 'species': species,
 })
 parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene')
 parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format')
 parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered')
 parser.add_option('-o', '--output', help='Path of the output SQLite file')
 parser.add_option('--of', help='Path of the output FASTA file')
-parser.add_option('--ff', help='Path of the filtered sequences output FASTA file')
+parser.add_option('--ff', default=os.devnull, help='Path of the filtered sequences output FASTA file')
 options, args = parser.parse_args()
 if args:
 raise Exception('Use options to provide inputs')
 if options.longestCDS:
 found_gene_transcript = True
 else:
 break
-if gene_id in gene_transcripts_dict:
+gene_transcripts_dict.setdefault(gene_id, []).append((transcript_id, len(entry.sequence)))
-gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence)))
-else:
-gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))]
 if options.longestCDS:
 # For each gene, select the transcript with the longest sequence.
 # If more than one transcript has the same longest sequence for a
 # gene, the first one to appear in the FASTA file is selected

Mercurial > repos > earlhaminst > gstf_preparation

comparison gstf_preparation.py @ 10:e8e75a79de59 draft