Mercurial > repos > earlhaminst > gstf_preparation

--- a/gstf_preparation.py	Fri Nov 24 12:32:39 2017 -0500
+++ b/gstf_preparation.py	Fri Dec 08 05:32:12 2017 -0500
@@ -251,6 +251,17 @@
     return results[0]


+def fetch_gene_id_for_transcript(conn, transcript_id):
+    cur = conn.cursor()
+
+    cur.execute('SELECT gene_id FROM transcript WHERE transcript_id=?',
+                (transcript_id, ))
+    results = cur.fetchone()
+    if not results:
+        return None
+    return results[0]
+
+
 def remove_id_version(s):
     """
     Remove the optional '.VERSION' from an Ensembl id.
@@ -266,6 +277,8 @@
     parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files')
     parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files')
     parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files')
+    parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene')
+    parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format')
     parser.add_option('-o', '--output', help='Path of the output SQLite file')
     parser.add_option('--of', help='Path of the output FASTA file')
     options, args = parser.parse_args()
@@ -287,6 +300,7 @@
         cds_parent_dict = dict()
         five_prime_utr_parent_dict = dict()
         three_prime_utr_parent_dict = dict()
+
         with open(filename) as f:
             for i, line in enumerate(f, start=1):
                 line = line.strip()
@@ -325,19 +339,47 @@
         with open(json_arg) as f:
             write_gene_dict_to_db(conn, json.load(f))

-    with open(options.of, 'w') as output_fasta_file:
+    if options.longestCDS:
+        gene_transcripts_dict = dict()
         for fasta_arg in options.fasta:
             for entry in FASTAReader_gen(fasta_arg):
                 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id
                 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
+
+                gene_id = fetch_gene_id_for_transcript(conn, transcript_id)
+                if not gene_id:
+                    continue
+
+                if gene_id in gene_transcripts_dict:
+                    gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence)))
+                else:
+                    gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))]
+
+        # For each gene, select the transcript with the longest sequence
+        # If more than one transcripts have the same longest sequence for a gene, the
+        # first one to appear in the FASTA file is selected
+        selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()]
+
+    with open(options.of, 'w') as output_fasta_file:
+        for fasta_arg in options.fasta:
+            for entry in FASTAReader_gen(fasta_arg):
+                transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
+                if options.longestCDS and transcript_id not in selected_transcript_ids:
+                    continue
+
                 species_for_transcript = fetch_species_for_transcript(conn, transcript_id)
                 if not species_for_transcript:
                     print("Transcript '%s' not found in the gene feature information" % transcript_id, file=sys.stderr)
                     continue
-                # Remove any underscore in the species
-                species_for_transcript = species_for_transcript.replace('_', '')
-                # Write the FASTA sequence using '>TranscriptId_species' as the header, as required by TreeBest
-                output_fasta_file.write(">%s_%s\n%s\n" % (transcript_id, species_for_transcript, entry.sequence))
+
+                if options.headers:
+                    # Change the FASTA header to '>TranscriptId_species', as required by TreeBest
+                    # Remove any underscore in the species
+                    header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', ''))
+                else:
+                    header = entry.header
+
+                output_fasta_file.write("%s\n%s\n" % (header, entry.sequence))

     conn.close()
--- a/gstf_preparation.xml	Fri Nov 24 12:32:39 2017 -0500
+++ b/gstf_preparation.xml	Fri Dec 08 05:32:12 2017 -0500
@@ -1,4 +1,4 @@
-<tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.3.0">
+<tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.4.0">
     <description>converts data for the workflow</description>
     <command detect_errors="exit_code">
 <![CDATA[
@@ -14,6 +14,12 @@
 #for $fasta_input in $fasta_inputs
     --fasta '${fasta_input}'
 #end for
+#if $headers
+    --headers
+#end if
+#if $longestCDS
+    -l
+#end if
 -o '$output_db'
 --of '$output_fasta'
 ]]>
@@ -28,6 +34,8 @@
         </repeat>
         <param name="json" type="data" format="json" multiple="true" optional="true" label="Gene features in JSON format generated by 'Get features by Ensembl ID' tool" />
         <param name="fasta_inputs" type="data" format="fasta" multiple="true" label="Corresponding FASTA datasets" help="Each FASTA header line should start with a transcript id" />
+        <param name="longestCDS" type="boolean" checked="false" label="Keep only the longest CDS per gene" />
+        <param name="headers" type="boolean" checked="true" label="Change the header line of the FASTA sequences to the &gt;TranscriptId_species format" help="As required by TreeBest, part of the GeneSeqToFamily workflow" />
     </inputs>

     <outputs>
@@ -40,12 +48,37 @@
             <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
             <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
             <param name="genome" value="caenorhabditis_elegans" />
+            <param name="longestCDS" value="false" />
+            <param name="headers" value="true" />
+
             <output name="output_db" file="test1.sqlite" compare="sim_size" />
             <output name="output_fasta" file="test1.fasta" />
         </test>
         <test>
+            <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
+            <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
+            <param name="genome" value="caenorhabditis_elegans" />
+            <param name="longestCDS" value="true" />
+            <param name="headers" value="true" />
+
+            <output name="output_db" file="test1.sqlite" compare="sim_size" />
+            <output name="output_fasta" file="test1_longest.fasta" />
+        </test>
+        <test>
+            <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
+            <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
+            <param name="genome" value="caenorhabditis_elegans" />
+            <param name="longestCDS" value="false" />
+            <param name="headers" value="false" />
+
+            <output name="output_db" file="test1.sqlite" compare="sim_size" />
+            <output name="output_fasta" file="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
+        </test>
+        <test>
             <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" />
             <param name="json" ftype="json" value="gene.json" />
+            <param name="longestCDS" value="false" />
+            <param name="headers" value="true" />

             <output name="output_db" file="test2.sqlite" compare="sim_size" />
             <output name="output_fasta" file="test2.fasta" />
@@ -55,7 +88,9 @@
 <![CDATA[
 **What it does**

-This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format and modify the header lines of a corresponding CDS FASTA to be used with the GeneSeqToFamily workflow.
+This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format.
+
+It also filters a CDS FASTA dataset to keep only the transcripts present in the gene feature information. Optionally it can also keep only the longest CDS per gene and/or change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow).

 Example GFF3 file::
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test1_longest.fasta	Fri Dec 08 05:32:12 2017 -0500
@@ -0,0 +1,51 @@
+>Y74C9A.3_caenorhabditiselegans
+ATGAGCTCTTCATCATCTTCTAGAATTCACAATGGTGAAGATGTTTATGAAAAGGCGGAG
+GAATACTGGAGCCGCGCGAGCCAGGACGTCAACGGAATGCTCGGCGGATTCGAAGCGCTT
+CACGCGCCCGACATATCGGCGTCGAAACGATTTATTGAAGGACTGAAGAAAAAGAATCTA
+TTCGGCTACTTTGACTATGCACTGGACTGCGGAGCGGGTATCGGACGTGTTACAAAGCAT
+CTCTTAATGCCATTCTTCTCGAAAGTTGATATGGAAGACGTCGTCGAGGAGTTGATCACG
+AAAAGTGATCAATATATTGGAAAACATCCACGAATTGGAGATAAATTCGTCGAAGGACTG
+CAGACGTTTGCACCGCCCGAACGACGTTATGATTTGATATGGATTCAATGGGTTTCAGGG
+CATTTGGTTGATGAGGATTTGGTTGATTTCTTTAAAAGATGTGCGAAAGGACTGAAACCT
+GGTGGATGTATTGTGCTCAAGGATAATGTGACAAATCACGAGAAACGGTTATTCGACGAT
+GATGATCATAGTTGGACGAGAACAGAGCCCGAGCTTCTTAAAGCGTTCGCCGATTCTCAA
+CTGGACATGGTCTCGAAAGCACTGCAAACCGGATTCCCAAAGGAGATTTATCCAGTAAAA
+ATGTATGCATTGAAGCCTCAACACACCGGATTCACCAATAATTGA
+>Y74C9A.2a.1_caenorhabditiselegans
+ATGAAACTCGTAATTCTGCTATCTTTTGTTGCGACAGTTGCGGTTTTTGCGGCTCCATCG
+GCTCCGGCAGGTCTCGAGGAGAAGCTGCGTGCTCTTCAGGAGCAACTGTACAGTCTGGAG
+AAAGAGAACGGAGTTGATGTGAAGCAAAAGGAGCAACCAGCAGCAGCCGACACATTCCTT
+GGATTTGTTCCACAGAAGAGAATGGTCGCGTGGCAGCCGATGAAGCGGTCGATGATCAAT
+GAGGATTCTAGAGCTCCATTGCTCCACGCAATCGAAGCCCGCTTGGCCGAAGTGTTGAGA
+GCCGGAGAACGCCTCGGAGTCAACCCGGAGGAAGTTTTGGCGGATCTTCGTGCTCGTAAT
+CAATTCCAATAA
+>Y74C9A.4b_caenorhabditiselegans
+ATGGATTCGTACACGTCATCTGACGAAGACGCCTCTCGAAAAGAAAATATTTCAGACGAA
+GGCTTGAATATGTTGAATGCATCGCCGGAGCCAATGGAGGAAGATGATCCAGAGGAGCAG
+GCGGAACAAGAAGAAGAAACCAGCAGAATGGCTCGTCCTATAAGATCCATGAGAAAACGC
+GAAACAACGTCTGGGGAATCAATGGGCGATGAGGATGAAGATTTGGAGGATGAAGAGGAC
+GAAGATGAAGAAGCTGAAGCTCGTGAGCATCATGAAAGTGGTGCTCATGACACATCTTTC
+TCAAATCCACTTTCCAACGTCGACAATCTAATCCACGTGGGAACCGAATATCAGGCGATT
+ATACAGCCAACTGCAGAGCAATTGGAAAAAGAACCGTGCAGAGATCAACAAATTTGGGCG
+TTTCCAGACGAAATGAACGAGAATCGGCTTACAGAATACATTTCAGAAGCTACTGGACGA
+TATCAATTACCTATAGATAGGGCTCTGTTCATTCTGAACAAACAGTCAAATGATTTCGAC
+GCTGCGATGGTTCAAGCGATGAGAAGAAAAGAAATTCATGATGATTGGACGGCAGAAGAA
+ATTAGTCTTTTCTCCACTTGCTTCTTTCATTTCGGAAAACGGTTCAAGAAGATTCATGCG
+GCTATGCCCCAACGCTCGCTTTCTTCCATTATCCAATACTATTACAACACGAAAAAAGTG
+CAAAACTATAAAACAATGATTAATGTGCATTTGAATGAAACCGACACTTATGATGAACTA
+TTCAAAGAGGTCAATCATTTGGAGAGGGTTCCGTCGGGATATTGTGAGAATTGCAATGCA
+AAAAGTGATCTGTTGATTCTAAATCGTGTAATGTCGCGTCACGAATGTAAACCGTGTATC
+CTTTATTTCCGTTTGATGCGTGTTCCACGTCCGGCAAGCCTCCGTGCACTGACAAAACGA
+CGGCAACGAGTTTTATGTCCAGAATACATGAAAATTTATGTATACGGATATCTTGAGCTC
+ATGGAGCCAGCCAACGGAAAAGCGATCAAACGGCTTGGAATTGGAAAAGAAAAAGAAGAA
+GACGATGATATTATGGTGGTCGACGACTGCCTTCTCCGTAAACCATCAGGCCCCTACATT
+GTGGAGCAATCGATTGAAGCTGATCCAATCGATGAGAATACGTGCAGAATGACACGGTGC
+TTCGATACACCGGCTGCACTGGCATTAATTGATAATATCAAGAGAAAACATCATATGTGT
+GTTCCACTTGTTTGGAGAGTTAAACAAACGAAATGTATGGAGGAGAACGAAATTCTGAAT
+GAAGAAGCCCGTCAACAAATGTTCCGTGCAACAATGACATACAGCCGTGTACCAAAAGGA
+GAAATTGCAAATTGGAAGAAAGATATGATGGCGTTGAAGGGAAGATTCGAGAGATTTACT
+CCTGAACTAGATACTACTGCAACAAATGGCAATCGATCTGGAAAAGTCAGGATAAATTAT
+GGTTGGAGTCCTGAGGAAAAGAAGAATGCTATTAGATGTTTCCACTGGTACAAGGACAAT
+TTCGAGTTGATCGCCGAGTTGATGGCCACAAAAACTGTGGAACAAATCAAAAAGTTCTAT
+ATGGACAATGAAAAGCTAATTTTGGAGTCAATCGACACGTATCGCGCCGAGCTCAAGTCA
+AAACTCGGCAAATAA