comparison gstf_preparation.py @ 10:e8e75a79de59 draft

"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9c8611fee927883f50bc6955771aa69df1ce8457"
author earlhaminst
date Thu, 31 Oct 2019 08:16:51 -0400
parents f4acbfe8d6fe
children dbe37a658cd2
comparison
equal deleted inserted replaced
9:f4acbfe8d6fe 10:e8e75a79de59
1 from __future__ import print_function 1 from __future__ import print_function
2 2
3 import json 3 import json
4 import optparse 4 import optparse
5 import os
5 import sqlite3 6 import sqlite3
6 import sys 7 import sys
7 8
8 version = "0.4.0" 9 version = "0.4.0"
9 gene_count = 0 10 gene_count = 0
112 if parent_dict is not None and 'Parent' in d: 113 if parent_dict is not None and 'Parent' in d:
113 # a 3' UTR can be split among multiple exons 114 # a 3' UTR can be split among multiple exons
114 # a 5' UTR can be split among multiple exons 115 # a 5' UTR can be split among multiple exons
115 # a CDS can be part of multiple transcripts 116 # a CDS can be part of multiple transcripts
116 for parent in d['Parent'].split(','): 117 for parent in d['Parent'].split(','):
117 if parent not in parent_dict: 118 parent_dict.setdefault(parent, []).append(d)
118 parent_dict[parent] = [d]
119 else:
120 parent_dict[parent].append(d)
121 return d 119 return d
122 120
123 121
124 def add_gene_to_dict(cols, species, gene_dict): 122 def add_gene_to_dict(cols, species, gene_dict):
125 global gene_count 123 global gene_count
137 gene_count = gene_count + 1 135 gene_count = gene_count + 1
138 136
139 137
140 def add_transcript_to_dict(cols, species, transcript_dict): 138 def add_transcript_to_dict(cols, species, transcript_dict):
141 transcript = feature_to_dict(cols) 139 transcript = feature_to_dict(cols)
140 if 'biotype' in transcript and transcript['biotype'] != 'protein_coding':
141 return
142 transcript.update({ 142 transcript.update({
143 'object_type': 'Transcript', 143 'object_type': 'Transcript',
144 'seq_region_name': cols[0], 144 'seq_region_name': cols[0],
145 'species': species, 145 'species': species,
146 }) 146 })
300 parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene') 300 parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene')
301 parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format') 301 parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format')
302 parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered') 302 parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered')
303 parser.add_option('-o', '--output', help='Path of the output SQLite file') 303 parser.add_option('-o', '--output', help='Path of the output SQLite file')
304 parser.add_option('--of', help='Path of the output FASTA file') 304 parser.add_option('--of', help='Path of the output FASTA file')
305 parser.add_option('--ff', help='Path of the filtered sequences output FASTA file') 305 parser.add_option('--ff', default=os.devnull, help='Path of the filtered sequences output FASTA file')
306 306
307 options, args = parser.parse_args() 307 options, args = parser.parse_args()
308 if args: 308 if args:
309 raise Exception('Use options to provide inputs') 309 raise Exception('Use options to provide inputs')
310 310
401 if options.longestCDS: 401 if options.longestCDS:
402 found_gene_transcript = True 402 found_gene_transcript = True
403 else: 403 else:
404 break 404 break
405 405
406 if gene_id in gene_transcripts_dict: 406 gene_transcripts_dict.setdefault(gene_id, []).append((transcript_id, len(entry.sequence)))
407 gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence)))
408 else:
409 gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))]
410 407
411 if options.longestCDS: 408 if options.longestCDS:
412 # For each gene, select the transcript with the longest sequence. 409 # For each gene, select the transcript with the longest sequence.
413 # If more than one transcripts have the same longest sequence for a 410 # If more than one transcripts have the same longest sequence for a
414 # gene, the first one to appear in the FASTA file is selected 411 # gene, the first one to appear in the FASTA file is selected