Mercurial > repos > earlhaminst > gstf_preparation
comparison gstf_preparation.py @ 8:92f3966d5bc3 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 88ba62ae8c3d9587a0015c72209242ad0c1df0c2
author | earlhaminst |
---|---|
date | Wed, 16 May 2018 20:03:57 -0400 |
parents | 56bbdbfe3eaa |
children | f4acbfe8d6fe |
comparison
equal
deleted
inserted
replaced
7:9ef7661e8e9c | 8:92f3966d5bc3 |
---|---|
1 from __future__ import print_function | 1 from __future__ import print_function |
2 | 2 |
3 import collections | |
4 import json | 3 import json |
5 import optparse | 4 import optparse |
6 import sqlite3 | 5 import sqlite3 |
7 import sys | 6 import sys |
8 | 7 |
9 version = "0.4.0" | 8 version = "0.4.0" |
10 gene_count = 0 | 9 gene_count = 0 |
11 | 10 |
12 Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) | 11 |
12 class Sequence(object): | |
13 def __init__(self, header, sequence_parts): | |
14 self.header = header | |
15 self.sequence_parts = sequence_parts | |
16 self._sequence = None | |
17 | |
18 @property | |
19 def sequence(self): | |
20 if self._sequence is None: | |
21 self._sequence = ''.join(self.sequence_parts) | |
22 return self._sequence | |
23 | |
24 def print(self, fh=sys.stdout): | |
25 print(self.header, file=fh) | |
26 for line in self.sequence_parts: | |
27 print(line, file=fh) | |
13 | 28 |
14 | 29 |
15 def FASTAReader_gen(fasta_filename): | 30 def FASTAReader_gen(fasta_filename): |
16 with open(fasta_filename) as fasta_file: | 31 with open(fasta_filename) as fasta_file: |
17 line = fasta_file.readline() | 32 line = fasta_file.readline() |
23 sequence_parts = [] | 38 sequence_parts = [] |
24 line = fasta_file.readline() | 39 line = fasta_file.readline() |
25 while line and line[0] != '>': | 40 while line and line[0] != '>': |
26 sequence_parts.append(line.rstrip()) | 41 sequence_parts.append(line.rstrip()) |
27 line = fasta_file.readline() | 42 line = fasta_file.readline() |
28 sequence = "\n".join(sequence_parts) | 43 yield Sequence(header, sequence_parts) |
29 yield Sequence(header, sequence) | |
30 | 44 |
31 | 45 |
32 def create_tables(conn): | 46 def create_tables(conn): |
33 cur = conn.cursor() | 47 cur = conn.cursor() |
34 | 48 |
358 for fasta_arg in options.fasta: | 372 for fasta_arg in options.fasta: |
359 for entry in FASTAReader_gen(fasta_arg): | 373 for entry in FASTAReader_gen(fasta_arg): |
360 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id | 374 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id |
361 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) | 375 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) |
362 | 376 |
377 if len(entry.sequence) % 3 != 0: | |
378 print("Transcript '%s' in file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr) | |
379 continue | |
380 | |
363 gene_id = fetch_gene_id_for_transcript(conn, transcript_id) | 381 gene_id = fetch_gene_id_for_transcript(conn, transcript_id) |
364 if not gene_id: | 382 if not gene_id: |
365 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) | 383 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) |
366 continue | 384 continue |
367 | 385 |
381 for entry in FASTAReader_gen(fasta_arg): | 399 for entry in FASTAReader_gen(fasta_arg): |
382 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) | 400 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) |
383 if options.longestCDS and transcript_id not in selected_transcript_ids: | 401 if options.longestCDS and transcript_id not in selected_transcript_ids: |
384 continue | 402 continue |
385 | 403 |
404 if len(entry.sequence) % 3 != 0: | |
405 print("Transcript '%s' in file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr) | |
406 continue | |
407 | |
386 species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id) | 408 species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id) |
387 if not species_for_transcript: | 409 if not species_for_transcript: |
388 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) | 410 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) |
389 continue | 411 continue |
390 | 412 |
391 if options.headers: | 413 if options.headers: |
392 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest | 414 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest |
393 # Remove any underscore in the species | 415 # Remove any underscore in the species |
394 header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', '')) | 416 entry.header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', '')) |
417 | |
418 if seq_region_for_transcript.lower() in regions: | |
419 entry.print(filtered_fasta_file) | |
395 else: | 420 else: |
396 header = entry.header | 421 entry.print(output_fasta_file) |
397 | |
398 if seq_region_for_transcript.lower() in regions: | |
399 filtered_fasta_file.write("%s\n%s\n" % (header, entry.sequence)) | |
400 else: | |
401 output_fasta_file.write("%s\n%s\n" % (header, entry.sequence)) | |
402 | 422 |
403 conn.close() | 423 conn.close() |
404 | 424 |
405 | 425 |
406 if __name__ == '__main__': | 426 if __name__ == '__main__': |