Mercurial > repos > earlhaminst > gstf_preparation
changeset 8:92f3966d5bc3 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 88ba62ae8c3d9587a0015c72209242ad0c1df0c2
author | earlhaminst |
---|---|
date | Wed, 16 May 2018 20:03:57 -0400 |
parents | 9ef7661e8e9c |
children | f4acbfe8d6fe |
files | gstf_preparation.py gstf_preparation.xml test-data/test1.sqlite test-data/test4.fasta test-data/test4.sqlite test-data/test5.ns.fasta test-data/test5.sqlite |
diffstat | 7 files changed, 36 insertions(+), 47 deletions(-) [+] |
line wrap: on
line diff
--- a/gstf_preparation.py Wed Apr 25 11:06:03 2018 -0400 +++ b/gstf_preparation.py Wed May 16 20:03:57 2018 -0400 @@ -1,6 +1,5 @@ from __future__ import print_function -import collections import json import optparse import sqlite3 @@ -9,7 +8,23 @@ version = "0.4.0" gene_count = 0 -Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) + +class Sequence(object): + def __init__(self, header, sequence_parts): + self.header = header + self.sequence_parts = sequence_parts + self._sequence = None + + @property + def sequence(self): + if self._sequence is None: + self._sequence = ''.join(self.sequence_parts) + return self._sequence + + def print(self, fh=sys.stdout): + print(self.header, file=fh) + for line in self.sequence_parts: + print(line, file=fh) def FASTAReader_gen(fasta_filename): @@ -25,8 +40,7 @@ while line and line[0] != '>': sequence_parts.append(line.rstrip()) line = fasta_file.readline() - sequence = "\n".join(sequence_parts) - yield Sequence(header, sequence) + yield Sequence(header, sequence_parts) def create_tables(conn): @@ -360,6 +374,10 @@ # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) + if len(entry.sequence) % 3 != 0: + print("Transcript '%s' in file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr) + continue + gene_id = fetch_gene_id_for_transcript(conn, transcript_id) if not gene_id: print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) @@ -383,6 +401,10 @@ if options.longestCDS and transcript_id not in selected_transcript_ids: continue + if len(entry.sequence) % 3 != 0: + print("Transcript '%s' in file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr) + continue + species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id) if not species_for_transcript: print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) @@ -391,14 +413,12 @@ if options.headers: # Change the FASTA header to '>TranscriptId_species', as required by TreeBest # Remove any underscore in the species - header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', '')) - else: - header = entry.header + entry.header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', '')) if seq_region_for_transcript.lower() in regions: - filtered_fasta_file.write("%s\n%s\n" % (header, entry.sequence)) + entry.print(filtered_fasta_file) else: - output_fasta_file.write("%s\n%s\n" % (header, entry.sequence)) + entry.print(output_fasta_file) conn.close()
--- a/gstf_preparation.xml Wed Apr 25 11:06:03 2018 -0400 +++ b/gstf_preparation.xml Wed May 16 20:03:57 2018 -0400 @@ -37,7 +37,7 @@ </param> </repeat> <param name="json" type="data" format="json" multiple="true" optional="true" label="Gene features in JSON format generated by 'Get features by Ensembl ID' tool" /> - <param name="fasta_inputs" type="data" format="fasta" multiple="true" label="Corresponding FASTA datasets" help="Each FASTA header line should start with a transcript id" /> + <param name="fasta_inputs" type="data" format="fasta" multiple="true" label="Corresponding CDS datasets in FASTA format" help="Each FASTA header line should start with a transcript id" /> <param name="longestCDS" type="boolean" checked="false" label="Keep only the longest CDS per gene" /> <param name="headers" type="boolean" checked="true" label="Change the header line of the FASTA sequences to the >TranscriptId_species format" help="As required by TreeBest, part of the GeneSeqToFamily workflow" /> <param name="regions" type="text" optional="true" label="Comma-separated list of region IDs (e.g. chromosomes or scaffolds) for which FASTA sequences should be filtered" help="Region IDs are in the `seqid` column for GFF3 and in the `seq_region_name` field in JSON. This is typically used to filter chromosomes with a non-standard genetic code, like mitochondria, to be analysed separately" /> @@ -111,7 +111,12 @@ This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format. -It also filters a CDS FASTA dataset to keep only the transcripts present in the gene feature information. Optionally it can also keep only the longest CDS per gene and/or change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow). +It also filters the CDS FASTA datasets to: + +- remove coding sequences whose length is not a multiple of 3 +- keep only the transcripts present in the gene feature information. + +Optionally it can also keep only the longest CDS per gene and/or change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow). Example GFF3 file::
--- a/test-data/test4.fasta Wed Apr 25 11:06:03 2018 -0400 +++ b/test-data/test4.fasta Wed May 16 20:03:57 2018 -0400 @@ -103,24 +103,6 @@ GTTCCAGCTTTTGAGATCACCCGCACCTTCTGGGAGAGAAACCTGCCTTCCGTGACCGGG CTGCTTAAGATCATCGGATTTTCCACCTCGGTAACTGCCCTGTGGCTTGCCGTGTACAAA TTCAGGCTGCTGACCCGATCCTGA ->ENSSSCT00000035258_susscrofa -TCGCTGCCCGCCATCATGGGCTTCATCCTTGCTCGAAAAGCTGACCGCCTTGCTAAGGTT -CATAAAGAAATAAGAAAGCGGAAAATCTGTGAGCTCTACGCCAAAGTGCTAGGATCTCAA -GAAGCTTTACATCCCGTGCACTATGAAGAGAAGAACTGGTGTGAGGAGCAGTACTCGGGG -GGCTGCTACACTGCCTACTTCCCCCCTGGGATCATGACTCAGTATGGAAGGGTGATCCGC -CAGCCCGTAGGCAGGATATTCTTTGCTGGCACCGAGACTGCCACACAATGGAGCGGTTAC -ATGGAAGGAGCAGTAGAAGCCGGCGAACGGGCGGCTAGAGAGGATGTTCCAGCTTTTGAG -ATCACCCGCACCTTCTGGGAGAGAAACCTGCCTTCCGTGACCGGGCTGCTTAAGATCATC -GGATTTTCCA ->ENSSSCT00000032764_susscrofa -ATGGATGACATGGGAAAGAAGATTCCAGCTGATGCACCATGGGAGTCTCCGCATGCAGAG -GAATGGGATAAGATGACCATGAAAGATCTCATCGATAAAATCTGTTGGACAAAGACTGCT -AAACGGTTTGCATCTCTCTTTGTAAATATCAATGTGACCTCCGAACCCCACGAAGTGTCT -GCCCTGTGGTTTTTGTGGTATGTGAAGCAGTGTGGAGGCACCACCCGGATATTCTCTGTT -ACCAACGGGGGCCAGGAACGGAAGTTTGTAGGCGGATCTGGTCAAGTAAGCGAACGGATA -ATGCACCTCCTCGGGGACAGAGTGAAGCTGAGGTGTCCTGTCACCTATGTTGACCAGTCA -GGTGACAACATCATCGTAGAGACATTGAATCATGAACTTTATGAGTGCCAATACGTAATT -AGTGCCATCCCTCCAACTCT >ENSCAFT00000022939_canisfamiliaris ATGGCGAGTAGAGAGAAGACGAGTATCGAGGGCCACATGTTTGACGTAGTCGTGATAGGA GGCGGCATCTCAGGATTGTCTGCTGCCAAACTCTTAGCCGAACATGAAGTTGATGTCTTA
--- a/test-data/test5.ns.fasta Wed Apr 25 11:06:03 2018 -0400 +++ b/test-data/test5.ns.fasta Wed May 16 20:03:57 2018 -0400 @@ -103,24 +103,6 @@ GTTCCAGCTTTTGAGATCACCCGCACCTTCTGGGAGAGAAACCTGCCTTCCGTGACCGGG CTGCTTAAGATCATCGGATTTTCCACCTCGGTAACTGCCCTGTGGCTTGCCGTGTACAAA TTCAGGCTGCTGACCCGATCCTGA ->ENSSSCT00000035258_susscrofa -TCGCTGCCCGCCATCATGGGCTTCATCCTTGCTCGAAAAGCTGACCGCCTTGCTAAGGTT -CATAAAGAAATAAGAAAGCGGAAAATCTGTGAGCTCTACGCCAAAGTGCTAGGATCTCAA -GAAGCTTTACATCCCGTGCACTATGAAGAGAAGAACTGGTGTGAGGAGCAGTACTCGGGG -GGCTGCTACACTGCCTACTTCCCCCCTGGGATCATGACTCAGTATGGAAGGGTGATCCGC -CAGCCCGTAGGCAGGATATTCTTTGCTGGCACCGAGACTGCCACACAATGGAGCGGTTAC -ATGGAAGGAGCAGTAGAAGCCGGCGAACGGGCGGCTAGAGAGGATGTTCCAGCTTTTGAG -ATCACCCGCACCTTCTGGGAGAGAAACCTGCCTTCCGTGACCGGGCTGCTTAAGATCATC -GGATTTTCCA ->ENSSSCT00000032764_susscrofa -ATGGATGACATGGGAAAGAAGATTCCAGCTGATGCACCATGGGAGTCTCCGCATGCAGAG -GAATGGGATAAGATGACCATGAAAGATCTCATCGATAAAATCTGTTGGACAAAGACTGCT -AAACGGTTTGCATCTCTCTTTGTAAATATCAATGTGACCTCCGAACCCCACGAAGTGTCT -GCCCTGTGGTTTTTGTGGTATGTGAAGCAGTGTGGAGGCACCACCCGGATATTCTCTGTT -ACCAACGGGGGCCAGGAACGGAAGTTTGTAGGCGGATCTGGTCAAGTAAGCGAACGGATA -ATGCACCTCCTCGGGGACAGAGTGAAGCTGAGGTGTCCTGTCACCTATGTTGACCAGTCA -GGTGACAACATCATCGTAGAGACATTGAATCATGAACTTTATGAGTGCCAATACGTAATT -AGTGCCATCCCTCCAACTCT >ENSCAFT00000022939_canisfamiliaris ATGGCGAGTAGAGAGAAGACGAGTATCGAGGGCCACATGTTTGACGTAGTCGTGATAGGA GGCGGCATCTCAGGATTGTCTGCTGCCAAACTCTTAGCCGAACATGAAGTTGATGTCTTA