Mercurial > repos > earlhaminst > gstf_preparation
changeset 4:284f64ad9d43 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
author | earlhaminst |
---|---|
date | Fri, 08 Dec 2017 05:32:12 -0500 |
parents | 7e11a7f4bdba |
children | b3ba0c84667c |
files | gstf_preparation.py gstf_preparation.xml test-data/test1_longest.fasta |
diffstat | 3 files changed, 135 insertions(+), 7 deletions(-) [+] |
line wrap: on
line diff
--- a/gstf_preparation.py Fri Nov 24 12:32:39 2017 -0500 +++ b/gstf_preparation.py Fri Dec 08 05:32:12 2017 -0500 @@ -251,6 +251,17 @@ return results[0] +def fetch_gene_id_for_transcript(conn, transcript_id): + cur = conn.cursor() + + cur.execute('SELECT gene_id FROM transcript WHERE transcript_id=?', + (transcript_id, )) + results = cur.fetchone() + if not results: + return None + return results[0] + + def remove_id_version(s): """ Remove the optional '.VERSION' from an Ensembl id. @@ -266,6 +277,8 @@ parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') + parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene') + parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format') parser.add_option('-o', '--output', help='Path of the output SQLite file') parser.add_option('--of', help='Path of the output FASTA file') options, args = parser.parse_args() @@ -287,6 +300,7 @@ cds_parent_dict = dict() five_prime_utr_parent_dict = dict() three_prime_utr_parent_dict = dict() + with open(filename) as f: for i, line in enumerate(f, start=1): line = line.strip() @@ -325,19 +339,47 @@ with open(json_arg) as f: write_gene_dict_to_db(conn, json.load(f)) - with open(options.of, 'w') as output_fasta_file: + if options.longestCDS: + gene_transcripts_dict = dict() for fasta_arg in options.fasta: for entry in FASTAReader_gen(fasta_arg): # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) + + gene_id = fetch_gene_id_for_transcript(conn, transcript_id) + if not gene_id: + continue + + if gene_id in gene_transcripts_dict: + gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence))) + else: + gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))] + + # For each gene, select the transcript with the longest sequence + # If more than one transcripts have the same longest sequence for a gene, the + # first one to appear in the FASTA file is selected + selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] + + with open(options.of, 'w') as output_fasta_file: + for fasta_arg in options.fasta: + for entry in FASTAReader_gen(fasta_arg): + transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) + if options.longestCDS and transcript_id not in selected_transcript_ids: + continue + species_for_transcript = fetch_species_for_transcript(conn, transcript_id) if not species_for_transcript: print("Transcript '%s' not found in the gene feature information" % transcript_id, file=sys.stderr) continue - # Remove any underscore in the species - species_for_transcript = species_for_transcript.replace('_', '') - # Write the FASTA sequence using '>TranscriptId_species' as the header, as required by TreeBest - output_fasta_file.write(">%s_%s\n%s\n" % (transcript_id, species_for_transcript, entry.sequence)) + + if options.headers: + # Change the FASTA header to '>TranscriptId_species', as required by TreeBest + # Remove any underscore in the species + header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', '')) + else: + header = entry.header + + output_fasta_file.write("%s\n%s\n" % (header, entry.sequence)) conn.close()
--- a/gstf_preparation.xml Fri Nov 24 12:32:39 2017 -0500 +++ b/gstf_preparation.xml Fri Dec 08 05:32:12 2017 -0500 @@ -1,4 +1,4 @@ -<tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.3.0"> +<tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.4.0"> <description>converts data for the workflow</description> <command detect_errors="exit_code"> <![CDATA[ @@ -14,6 +14,12 @@ #for $fasta_input in $fasta_inputs --fasta '${fasta_input}' #end for +#if $headers + --headers +#end if +#if $longestCDS + -l +#end if -o '$output_db' --of '$output_fasta' ]]> @@ -28,6 +34,8 @@ </repeat> <param name="json" type="data" format="json" multiple="true" optional="true" label="Gene features in JSON format generated by 'Get features by Ensembl ID' tool" /> <param name="fasta_inputs" type="data" format="fasta" multiple="true" label="Corresponding FASTA datasets" help="Each FASTA header line should start with a transcript id" /> + <param name="longestCDS" type="boolean" checked="false" label="Keep only the longest CDS per gene" /> + <param name="headers" type="boolean" checked="true" label="Change the header line of the FASTA sequences to the >TranscriptId_species format" help="As required by TreeBest, part of the GeneSeqToFamily workflow" /> </inputs> <outputs> @@ -40,12 +48,37 @@ <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" /> <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" /> <param name="genome" value="caenorhabditis_elegans" /> + <param name="longestCDS" value="false" /> + <param name="headers" value="true" /> + <output name="output_db" file="test1.sqlite" compare="sim_size" /> <output name="output_fasta" file="test1.fasta" /> </test> <test> + <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" /> + <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" /> + <param name="genome" value="caenorhabditis_elegans" /> + <param name="longestCDS" value="true" /> + <param name="headers" value="true" /> + + <output name="output_db" file="test1.sqlite" compare="sim_size" /> + <output name="output_fasta" file="test1_longest.fasta" /> + </test> + <test> + <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" /> + <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" /> + <param name="genome" value="caenorhabditis_elegans" /> + <param name="longestCDS" value="false" /> + <param name="headers" value="false" /> + + <output name="output_db" file="test1.sqlite" compare="sim_size" /> + <output name="output_fasta" file="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" /> + </test> + <test> <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" /> <param name="json" ftype="json" value="gene.json" /> + <param name="longestCDS" value="false" /> + <param name="headers" value="true" /> <output name="output_db" file="test2.sqlite" compare="sim_size" /> <output name="output_fasta" file="test2.fasta" /> @@ -55,7 +88,9 @@ <![CDATA[ **What it does** -This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format and modify the header lines of a corresponding CDS FASTA to be used with the GeneSeqToFamily workflow. +This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format. + +It also filters a CDS FASTA dataset to keep only the transcripts present in the gene feature information. Optionally it can also keep only the longest CDS per gene and/or change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow). Example GFF3 file::
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test1_longest.fasta Fri Dec 08 05:32:12 2017 -0500 @@ -0,0 +1,51 @@ +>Y74C9A.3_caenorhabditiselegans +ATGAGCTCTTCATCATCTTCTAGAATTCACAATGGTGAAGATGTTTATGAAAAGGCGGAG +GAATACTGGAGCCGCGCGAGCCAGGACGTCAACGGAATGCTCGGCGGATTCGAAGCGCTT +CACGCGCCCGACATATCGGCGTCGAAACGATTTATTGAAGGACTGAAGAAAAAGAATCTA +TTCGGCTACTTTGACTATGCACTGGACTGCGGAGCGGGTATCGGACGTGTTACAAAGCAT +CTCTTAATGCCATTCTTCTCGAAAGTTGATATGGAAGACGTCGTCGAGGAGTTGATCACG +AAAAGTGATCAATATATTGGAAAACATCCACGAATTGGAGATAAATTCGTCGAAGGACTG +CAGACGTTTGCACCGCCCGAACGACGTTATGATTTGATATGGATTCAATGGGTTTCAGGG +CATTTGGTTGATGAGGATTTGGTTGATTTCTTTAAAAGATGTGCGAAAGGACTGAAACCT +GGTGGATGTATTGTGCTCAAGGATAATGTGACAAATCACGAGAAACGGTTATTCGACGAT +GATGATCATAGTTGGACGAGAACAGAGCCCGAGCTTCTTAAAGCGTTCGCCGATTCTCAA +CTGGACATGGTCTCGAAAGCACTGCAAACCGGATTCCCAAAGGAGATTTATCCAGTAAAA +ATGTATGCATTGAAGCCTCAACACACCGGATTCACCAATAATTGA +>Y74C9A.2a.1_caenorhabditiselegans +ATGAAACTCGTAATTCTGCTATCTTTTGTTGCGACAGTTGCGGTTTTTGCGGCTCCATCG +GCTCCGGCAGGTCTCGAGGAGAAGCTGCGTGCTCTTCAGGAGCAACTGTACAGTCTGGAG +AAAGAGAACGGAGTTGATGTGAAGCAAAAGGAGCAACCAGCAGCAGCCGACACATTCCTT +GGATTTGTTCCACAGAAGAGAATGGTCGCGTGGCAGCCGATGAAGCGGTCGATGATCAAT +GAGGATTCTAGAGCTCCATTGCTCCACGCAATCGAAGCCCGCTTGGCCGAAGTGTTGAGA +GCCGGAGAACGCCTCGGAGTCAACCCGGAGGAAGTTTTGGCGGATCTTCGTGCTCGTAAT +CAATTCCAATAA +>Y74C9A.4b_caenorhabditiselegans +ATGGATTCGTACACGTCATCTGACGAAGACGCCTCTCGAAAAGAAAATATTTCAGACGAA +GGCTTGAATATGTTGAATGCATCGCCGGAGCCAATGGAGGAAGATGATCCAGAGGAGCAG +GCGGAACAAGAAGAAGAAACCAGCAGAATGGCTCGTCCTATAAGATCCATGAGAAAACGC +GAAACAACGTCTGGGGAATCAATGGGCGATGAGGATGAAGATTTGGAGGATGAAGAGGAC +GAAGATGAAGAAGCTGAAGCTCGTGAGCATCATGAAAGTGGTGCTCATGACACATCTTTC +TCAAATCCACTTTCCAACGTCGACAATCTAATCCACGTGGGAACCGAATATCAGGCGATT +ATACAGCCAACTGCAGAGCAATTGGAAAAAGAACCGTGCAGAGATCAACAAATTTGGGCG +TTTCCAGACGAAATGAACGAGAATCGGCTTACAGAATACATTTCAGAAGCTACTGGACGA +TATCAATTACCTATAGATAGGGCTCTGTTCATTCTGAACAAACAGTCAAATGATTTCGAC +GCTGCGATGGTTCAAGCGATGAGAAGAAAAGAAATTCATGATGATTGGACGGCAGAAGAA +ATTAGTCTTTTCTCCACTTGCTTCTTTCATTTCGGAAAACGGTTCAAGAAGATTCATGCG +GCTATGCCCCAACGCTCGCTTTCTTCCATTATCCAATACTATTACAACACGAAAAAAGTG +CAAAACTATAAAACAATGATTAATGTGCATTTGAATGAAACCGACACTTATGATGAACTA +TTCAAAGAGGTCAATCATTTGGAGAGGGTTCCGTCGGGATATTGTGAGAATTGCAATGCA +AAAAGTGATCTGTTGATTCTAAATCGTGTAATGTCGCGTCACGAATGTAAACCGTGTATC +CTTTATTTCCGTTTGATGCGTGTTCCACGTCCGGCAAGCCTCCGTGCACTGACAAAACGA +CGGCAACGAGTTTTATGTCCAGAATACATGAAAATTTATGTATACGGATATCTTGAGCTC +ATGGAGCCAGCCAACGGAAAAGCGATCAAACGGCTTGGAATTGGAAAAGAAAAAGAAGAA +GACGATGATATTATGGTGGTCGACGACTGCCTTCTCCGTAAACCATCAGGCCCCTACATT +GTGGAGCAATCGATTGAAGCTGATCCAATCGATGAGAATACGTGCAGAATGACACGGTGC +TTCGATACACCGGCTGCACTGGCATTAATTGATAATATCAAGAGAAAACATCATATGTGT +GTTCCACTTGTTTGGAGAGTTAAACAAACGAAATGTATGGAGGAGAACGAAATTCTGAAT +GAAGAAGCCCGTCAACAAATGTTCCGTGCAACAATGACATACAGCCGTGTACCAAAAGGA +GAAATTGCAAATTGGAAGAAAGATATGATGGCGTTGAAGGGAAGATTCGAGAGATTTACT +CCTGAACTAGATACTACTGCAACAAATGGCAATCGATCTGGAAAAGTCAGGATAAATTAT +GGTTGGAGTCCTGAGGAAAAGAAGAATGCTATTAGATGTTTCCACTGGTACAAGGACAAT +TTCGAGTTGATCGCCGAGTTGATGGCCACAAAAACTGTGGAACAAATCAAAAAGTTCTAT +ATGGACAATGAAAAGCTAATTTTGGAGTCAATCGACACGTATCGCGCCGAGCTCAAGTCA +AAACTCGGCAAATAA