Mercurial > repos > earlhaminst > gstf_preparation
comparison gstf_preparation.py @ 10:e8e75a79de59 draft
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9c8611fee927883f50bc6955771aa69df1ce8457"
author | earlhaminst |
---|---|
date | Thu, 31 Oct 2019 08:16:51 -0400 |
parents | f4acbfe8d6fe |
children | dbe37a658cd2 |
comparison
equal
deleted
inserted
replaced
9:f4acbfe8d6fe | 10:e8e75a79de59 |
---|---|
1 from __future__ import print_function | 1 from __future__ import print_function |
2 | 2 |
3 import json | 3 import json |
4 import optparse | 4 import optparse |
5 import os | |
5 import sqlite3 | 6 import sqlite3 |
6 import sys | 7 import sys |
7 | 8 |
8 version = "0.4.0" | 9 version = "0.4.0" |
9 gene_count = 0 | 10 gene_count = 0 |
112 if parent_dict is not None and 'Parent' in d: | 113 if parent_dict is not None and 'Parent' in d: |
113 # a 3' UTR can be split among multiple exons | 114 # a 3' UTR can be split among multiple exons |
114 # a 5' UTR can be split among multiple exons | 115 # a 5' UTR can be split among multiple exons |
115 # a CDS can be part of multiple transcripts | 116 # a CDS can be part of multiple transcripts |
116 for parent in d['Parent'].split(','): | 117 for parent in d['Parent'].split(','): |
117 if parent not in parent_dict: | 118 parent_dict.setdefault(parent, []).append(d) |
118 parent_dict[parent] = [d] | |
119 else: | |
120 parent_dict[parent].append(d) | |
121 return d | 119 return d |
122 | 120 |
123 | 121 |
124 def add_gene_to_dict(cols, species, gene_dict): | 122 def add_gene_to_dict(cols, species, gene_dict): |
125 global gene_count | 123 global gene_count |
137 gene_count = gene_count + 1 | 135 gene_count = gene_count + 1 |
138 | 136 |
139 | 137 |
140 def add_transcript_to_dict(cols, species, transcript_dict): | 138 def add_transcript_to_dict(cols, species, transcript_dict): |
141 transcript = feature_to_dict(cols) | 139 transcript = feature_to_dict(cols) |
140 if 'biotype' in transcript and transcript['biotype'] != 'protein_coding': | |
141 return | |
142 transcript.update({ | 142 transcript.update({ |
143 'object_type': 'Transcript', | 143 'object_type': 'Transcript', |
144 'seq_region_name': cols[0], | 144 'seq_region_name': cols[0], |
145 'species': species, | 145 'species': species, |
146 }) | 146 }) |
300 parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene') | 300 parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene') |
301 parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format') | 301 parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format') |
302 parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered') | 302 parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered') |
303 parser.add_option('-o', '--output', help='Path of the output SQLite file') | 303 parser.add_option('-o', '--output', help='Path of the output SQLite file') |
304 parser.add_option('--of', help='Path of the output FASTA file') | 304 parser.add_option('--of', help='Path of the output FASTA file') |
305 parser.add_option('--ff', help='Path of the filtered sequences output FASTA file') | 305 parser.add_option('--ff', default=os.devnull, help='Path of the filtered sequences output FASTA file') |
306 | 306 |
307 options, args = parser.parse_args() | 307 options, args = parser.parse_args() |
308 if args: | 308 if args: |
309 raise Exception('Use options to provide inputs') | 309 raise Exception('Use options to provide inputs') |
310 | 310 |
401 if options.longestCDS: | 401 if options.longestCDS: |
402 found_gene_transcript = True | 402 found_gene_transcript = True |
403 else: | 403 else: |
404 break | 404 break |
405 | 405 |
406 if gene_id in gene_transcripts_dict: | 406 gene_transcripts_dict.setdefault(gene_id, []).append((transcript_id, len(entry.sequence))) |
407 gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence))) | |
408 else: | |
409 gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))] | |
410 | 407 |
411 if options.longestCDS: | 408 if options.longestCDS: |
412 # For each gene, select the transcript with the longest sequence. | 409 # For each gene, select the transcript with the longest sequence. |
413 # If more than one transcripts have the same longest sequence for a | 410 # If more than one transcripts have the same longest sequence for a |
414 # gene, the first one to appear in the FASTA file is selected | 411 # gene, the first one to appear in the FASTA file is selected |