Mercurial > repos > earlhaminst > gstf_preparation
diff gstf_preparation.py @ 10:e8e75a79de59 draft
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9c8611fee927883f50bc6955771aa69df1ce8457"
author | earlhaminst |
---|---|
date | Thu, 31 Oct 2019 08:16:51 -0400 |
parents | f4acbfe8d6fe |
children | dbe37a658cd2 |
line wrap: on
line diff
--- a/gstf_preparation.py Wed Oct 17 07:31:29 2018 -0400 +++ b/gstf_preparation.py Thu Oct 31 08:16:51 2019 -0400 @@ -2,6 +2,7 @@ import json import optparse +import os import sqlite3 import sys @@ -114,10 +115,7 @@ # a 5' UTR can be split among multiple exons # a CDS can be part of multiple transcripts for parent in d['Parent'].split(','): - if parent not in parent_dict: - parent_dict[parent] = [d] - else: - parent_dict[parent].append(d) + parent_dict.setdefault(parent, []).append(d) return d @@ -139,6 +137,8 @@ def add_transcript_to_dict(cols, species, transcript_dict): transcript = feature_to_dict(cols) + if 'biotype' in transcript and transcript['biotype'] != 'protein_coding': + return transcript.update({ 'object_type': 'Transcript', 'seq_region_name': cols[0], @@ -302,7 +302,7 @@ parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered') parser.add_option('-o', '--output', help='Path of the output SQLite file') parser.add_option('--of', help='Path of the output FASTA file') - parser.add_option('--ff', help='Path of the filtered sequences output FASTA file') + parser.add_option('--ff', default=os.devnull, help='Path of the filtered sequences output FASTA file') options, args = parser.parse_args() if args: @@ -403,10 +403,7 @@ else: break - if gene_id in gene_transcripts_dict: - gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence))) - else: - gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))] + gene_transcripts_dict.setdefault(gene_id, []).append((transcript_id, len(entry.sequence))) if options.longestCDS: # For each gene, select the transcript with the longest sequence.