# HG changeset patch
# User earlhaminst
# Date 1526515437 14400
# Node ID 92f3966d5bc3b75bdea120639891a499d2f76425
# Parent 9ef7661e8e9c63b72e3968fec080dec7d0abe1ab
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 88ba62ae8c3d9587a0015c72209242ad0c1df0c2
diff -r 9ef7661e8e9c -r 92f3966d5bc3 gstf_preparation.py
--- a/gstf_preparation.py Wed Apr 25 11:06:03 2018 -0400
+++ b/gstf_preparation.py Wed May 16 20:03:57 2018 -0400
@@ -1,6 +1,5 @@
from __future__ import print_function
-import collections
import json
import optparse
import sqlite3
@@ -9,7 +8,23 @@
version = "0.4.0"
gene_count = 0
-Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])
+
+class Sequence(object):
+ def __init__(self, header, sequence_parts):
+ self.header = header
+ self.sequence_parts = sequence_parts
+ self._sequence = None
+
+ @property
+ def sequence(self):
+ if self._sequence is None:
+ self._sequence = ''.join(self.sequence_parts)
+ return self._sequence
+
+ def print(self, fh=sys.stdout):
+ print(self.header, file=fh)
+ for line in self.sequence_parts:
+ print(line, file=fh)
def FASTAReader_gen(fasta_filename):
@@ -25,8 +40,7 @@
while line and line[0] != '>':
sequence_parts.append(line.rstrip())
line = fasta_file.readline()
- sequence = "\n".join(sequence_parts)
- yield Sequence(header, sequence)
+ yield Sequence(header, sequence_parts)
def create_tables(conn):
@@ -360,6 +374,10 @@
# Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id
transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
+ if len(entry.sequence) % 3 != 0:
+ print("Transcript '%s' in file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr)
+ continue
+
gene_id = fetch_gene_id_for_transcript(conn, transcript_id)
if not gene_id:
print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
@@ -383,6 +401,10 @@
if options.longestCDS and transcript_id not in selected_transcript_ids:
continue
+ if len(entry.sequence) % 3 != 0:
+ print("Transcript '%s' in file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr)
+ continue
+
species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id)
if not species_for_transcript:
print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
@@ -391,14 +413,12 @@
if options.headers:
# Change the FASTA header to '>TranscriptId_species', as required by TreeBest
# Remove any underscore in the species
- header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', ''))
- else:
- header = entry.header
+ entry.header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', ''))
if seq_region_for_transcript.lower() in regions:
- filtered_fasta_file.write("%s\n%s\n" % (header, entry.sequence))
+ entry.print(filtered_fasta_file)
else:
- output_fasta_file.write("%s\n%s\n" % (header, entry.sequence))
+ entry.print(output_fasta_file)
conn.close()
diff -r 9ef7661e8e9c -r 92f3966d5bc3 gstf_preparation.xml
--- a/gstf_preparation.xml Wed Apr 25 11:06:03 2018 -0400
+++ b/gstf_preparation.xml Wed May 16 20:03:57 2018 -0400
@@ -37,7 +37,7 @@
-
+
@@ -111,7 +111,12 @@
This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format.
-It also filters a CDS FASTA dataset to keep only the transcripts present in the gene feature information. Optionally it can also keep only the longest CDS per gene and/or change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow).
+It also filters the CDS FASTA datasets to:
+
+- remove coding sequences whose length is not a multiple of 3
+- keep only the transcripts present in the gene feature information.
+
+Optionally it can also keep only the longest CDS per gene and/or change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow).
Example GFF3 file::
diff -r 9ef7661e8e9c -r 92f3966d5bc3 test-data/test1.sqlite
Binary file test-data/test1.sqlite has changed
diff -r 9ef7661e8e9c -r 92f3966d5bc3 test-data/test4.fasta
--- a/test-data/test4.fasta Wed Apr 25 11:06:03 2018 -0400
+++ b/test-data/test4.fasta Wed May 16 20:03:57 2018 -0400
@@ -103,24 +103,6 @@
GTTCCAGCTTTTGAGATCACCCGCACCTTCTGGGAGAGAAACCTGCCTTCCGTGACCGGG
CTGCTTAAGATCATCGGATTTTCCACCTCGGTAACTGCCCTGTGGCTTGCCGTGTACAAA
TTCAGGCTGCTGACCCGATCCTGA
->ENSSSCT00000035258_susscrofa
-TCGCTGCCCGCCATCATGGGCTTCATCCTTGCTCGAAAAGCTGACCGCCTTGCTAAGGTT
-CATAAAGAAATAAGAAAGCGGAAAATCTGTGAGCTCTACGCCAAAGTGCTAGGATCTCAA
-GAAGCTTTACATCCCGTGCACTATGAAGAGAAGAACTGGTGTGAGGAGCAGTACTCGGGG
-GGCTGCTACACTGCCTACTTCCCCCCTGGGATCATGACTCAGTATGGAAGGGTGATCCGC
-CAGCCCGTAGGCAGGATATTCTTTGCTGGCACCGAGACTGCCACACAATGGAGCGGTTAC
-ATGGAAGGAGCAGTAGAAGCCGGCGAACGGGCGGCTAGAGAGGATGTTCCAGCTTTTGAG
-ATCACCCGCACCTTCTGGGAGAGAAACCTGCCTTCCGTGACCGGGCTGCTTAAGATCATC
-GGATTTTCCA
->ENSSSCT00000032764_susscrofa
-ATGGATGACATGGGAAAGAAGATTCCAGCTGATGCACCATGGGAGTCTCCGCATGCAGAG
-GAATGGGATAAGATGACCATGAAAGATCTCATCGATAAAATCTGTTGGACAAAGACTGCT
-AAACGGTTTGCATCTCTCTTTGTAAATATCAATGTGACCTCCGAACCCCACGAAGTGTCT
-GCCCTGTGGTTTTTGTGGTATGTGAAGCAGTGTGGAGGCACCACCCGGATATTCTCTGTT
-ACCAACGGGGGCCAGGAACGGAAGTTTGTAGGCGGATCTGGTCAAGTAAGCGAACGGATA
-ATGCACCTCCTCGGGGACAGAGTGAAGCTGAGGTGTCCTGTCACCTATGTTGACCAGTCA
-GGTGACAACATCATCGTAGAGACATTGAATCATGAACTTTATGAGTGCCAATACGTAATT
-AGTGCCATCCCTCCAACTCT
>ENSCAFT00000022939_canisfamiliaris
ATGGCGAGTAGAGAGAAGACGAGTATCGAGGGCCACATGTTTGACGTAGTCGTGATAGGA
GGCGGCATCTCAGGATTGTCTGCTGCCAAACTCTTAGCCGAACATGAAGTTGATGTCTTA
diff -r 9ef7661e8e9c -r 92f3966d5bc3 test-data/test4.sqlite
Binary file test-data/test4.sqlite has changed
diff -r 9ef7661e8e9c -r 92f3966d5bc3 test-data/test5.ns.fasta
--- a/test-data/test5.ns.fasta Wed Apr 25 11:06:03 2018 -0400
+++ b/test-data/test5.ns.fasta Wed May 16 20:03:57 2018 -0400
@@ -103,24 +103,6 @@
GTTCCAGCTTTTGAGATCACCCGCACCTTCTGGGAGAGAAACCTGCCTTCCGTGACCGGG
CTGCTTAAGATCATCGGATTTTCCACCTCGGTAACTGCCCTGTGGCTTGCCGTGTACAAA
TTCAGGCTGCTGACCCGATCCTGA
->ENSSSCT00000035258_susscrofa
-TCGCTGCCCGCCATCATGGGCTTCATCCTTGCTCGAAAAGCTGACCGCCTTGCTAAGGTT
-CATAAAGAAATAAGAAAGCGGAAAATCTGTGAGCTCTACGCCAAAGTGCTAGGATCTCAA
-GAAGCTTTACATCCCGTGCACTATGAAGAGAAGAACTGGTGTGAGGAGCAGTACTCGGGG
-GGCTGCTACACTGCCTACTTCCCCCCTGGGATCATGACTCAGTATGGAAGGGTGATCCGC
-CAGCCCGTAGGCAGGATATTCTTTGCTGGCACCGAGACTGCCACACAATGGAGCGGTTAC
-ATGGAAGGAGCAGTAGAAGCCGGCGAACGGGCGGCTAGAGAGGATGTTCCAGCTTTTGAG
-ATCACCCGCACCTTCTGGGAGAGAAACCTGCCTTCCGTGACCGGGCTGCTTAAGATCATC
-GGATTTTCCA
->ENSSSCT00000032764_susscrofa
-ATGGATGACATGGGAAAGAAGATTCCAGCTGATGCACCATGGGAGTCTCCGCATGCAGAG
-GAATGGGATAAGATGACCATGAAAGATCTCATCGATAAAATCTGTTGGACAAAGACTGCT
-AAACGGTTTGCATCTCTCTTTGTAAATATCAATGTGACCTCCGAACCCCACGAAGTGTCT
-GCCCTGTGGTTTTTGTGGTATGTGAAGCAGTGTGGAGGCACCACCCGGATATTCTCTGTT
-ACCAACGGGGGCCAGGAACGGAAGTTTGTAGGCGGATCTGGTCAAGTAAGCGAACGGATA
-ATGCACCTCCTCGGGGACAGAGTGAAGCTGAGGTGTCCTGTCACCTATGTTGACCAGTCA
-GGTGACAACATCATCGTAGAGACATTGAATCATGAACTTTATGAGTGCCAATACGTAATT
-AGTGCCATCCCTCCAACTCT
>ENSCAFT00000022939_canisfamiliaris
ATGGCGAGTAGAGAGAAGACGAGTATCGAGGGCCACATGTTTGACGTAGTCGTGATAGGA
GGCGGCATCTCAGGATTGTCTGCTGCCAAACTCTTAGCCGAACATGAAGTTGATGTCTTA
diff -r 9ef7661e8e9c -r 92f3966d5bc3 test-data/test5.sqlite
Binary file test-data/test5.sqlite has changed