gstf_preparation: gstf_preparation.py comparison

comparison gstf_preparation.py @ 8:92f3966d5bc3 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 88ba62ae8c3d9587a0015c72209242ad0c1df0c2

author	earlhaminst
date	Wed, 16 May 2018 20:03:57 -0400
parents	56bbdbfe3eaa
children	f4acbfe8d6fe

comparison

Legend: equal | deleted | inserted | replaced

-:9ef7661e8e9c
+:92f3966d5bc3
 from __future__ import print_function
-import collections
 import json
 import optparse
 import sqlite3
 import sys
 version = "0.4.0"
 gene_count = 0
-Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])
+class Sequence(object):
+def __init__(self, header, sequence_parts):
+self.header = header
+self.sequence_parts = sequence_parts
+self._sequence = None
+@property
+def sequence(self):
+if self._sequence is None:
+self._sequence = ''.join(self.sequence_parts)
+return self._sequence
+def print(self, fh=sys.stdout):
+print(self.header, file=fh)
+for line in self.sequence_parts:
+print(line, file=fh)
 def FASTAReader_gen(fasta_filename):
 with open(fasta_filename) as fasta_file:
 line = fasta_file.readline()
 sequence_parts = []
 line = fasta_file.readline()
 while line and line[0] != '>':
 sequence_parts.append(line.rstrip())
 line = fasta_file.readline()
-sequence = "\n".join(sequence_parts)
+yield Sequence(header, sequence_parts)
-yield Sequence(header, sequence)
 def create_tables(conn):
 cur = conn.cursor()
 for fasta_arg in options.fasta:
 for entry in FASTAReader_gen(fasta_arg):
 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id
 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
+if len(entry.sequence) % 3 != 0:
+print("Transcript '%s' in file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr)
+continue
 gene_id = fetch_gene_id_for_transcript(conn, transcript_id)
 if not gene_id:
 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
 continue
 for entry in FASTAReader_gen(fasta_arg):
 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
 if options.longestCDS and transcript_id not in selected_transcript_ids:
 continue
+if len(entry.sequence) % 3 != 0:
+print("Transcript '%s' in file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr)
+continue
 species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id)
 if not species_for_transcript:
 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
 continue
 if options.headers:
 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest
 # Remove any underscore in the species
-header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', ''))
+entry.header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', ''))
+if seq_region_for_transcript.lower() in regions:
+entry.print(filtered_fasta_file)
 else:
-header = entry.header
+entry.print(output_fasta_file)
-if seq_region_for_transcript.lower() in regions:
-filtered_fasta_file.write("%s\n%s\n" % (header, entry.sequence))
-else:
-output_fasta_file.write("%s\n%s\n" % (header, entry.sequence))
 conn.close()
 if __name__ == '__main__':

Mercurial > repos > earlhaminst > gstf_preparation

comparison gstf_preparation.py @ 8:92f3966d5bc3 draft