Mercurial > repos > earlhaminst > gstf_preparation

--- a/gstf_preparation.py	Sun Sep 27 18:54:31 2020 +0000
+++ b/gstf_preparation.py	Mon Oct 05 13:33:59 2020 +0000
@@ -258,7 +258,7 @@
         if gene is None:
             # This can happen when loading a JSON file from Ensembl
             continue
-        if 'confidence' in gene and gene['confidence'] != 'high':
+        if 'confidence' in gene and gene['confidence'].lower() != 'high':
             print("Gene %s has confidence %s (not high), discarding" % (gene['id'], gene['confidence']), file=sys.stderr)
             continue
         gene_id = gene['id']
@@ -300,7 +300,7 @@
     parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files')
     parser.add_option('--filter', type='choice', choices=['canonical', 'coding', ''], default='', help='Which transcripts to keep')
     parser.add_option('--headers', type='choice',
-                      choices=['TranscriptId_species', 'GeneSymbol-TranscriptID_species', 'TranscriptSymbol-TranscriptID_species', ''],
+                      choices=['TranscriptId_species', 'TranscriptID-GeneSymbol_species', 'TranscriptID-TranscriptSymbol_species', ''],
                       default='', help='Change the header line of the FASTA sequences to this format')
     parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered')
     parser.add_option('-o', '--output', help='Path of the output SQLite file')
@@ -454,12 +454,12 @@
                     # Change the FASTA header to '>TranscriptId_species', as required by TreeBest
                     # Remove any underscore in the species
                     entry.header = ">%s_%s" % (transcript_id, transcript['species'].replace('_', ''))
-                elif options.headers == "GeneSymbol-TranscriptID_species":
+                elif options.headers == "TranscriptID-GeneSymbol_species":
                     # Remove any underscore in the species
-                    entry.header = ">%s-%s_%s" % (transcript['gene_symbol'], transcript_id, transcript['species'].replace('_', ''))
-                elif options.headers == "TranscriptSymbol-TranscriptID_species":
+                    entry.header = ">%s-%s_%s" % (transcript_id, transcript['gene_symbol'], transcript['species'].replace('_', ''))
+                elif options.headers == "TranscriptID-TranscriptSymbol_species":
                     # Remove any underscore in the species
-                    entry.header = ">%s-%s_%s" % (transcript['transcript_symbol'], transcript_id, transcript['species'].replace('_', ''))
+                    entry.header = ">%s-%s_%s" % (transcript_id, transcript['transcript_symbol'], transcript['species'].replace('_', ''))

                 if transcript['seq_region_name'].lower() in regions:
                     entry.print(filtered_fasta_file)
--- a/gstf_preparation.xml	Sun Sep 27 18:54:31 2020 +0000
+++ b/gstf_preparation.xml	Mon Oct 05 13:33:59 2020 +0000
@@ -1,4 +1,4 @@
-<tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.4.2">
+<tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.4.3">
     <description>converts data for the workflow</description>
     <requirements>
         <requirement type="package" version="3.7">python</requirement>
@@ -47,8 +47,8 @@

         <param name="headers" type="select" display="radio" label="Change the header line of the FASTA sequences to the following format" help="As required by TreeBest, part of the GeneSeqToFamily workflow, only TranscriptId_species is acceptable format by Aequatus visualisation">
             <option value="TranscriptId_species" selected="true">TranscriptId_species</option>
-            <option value="GeneSymbol-TranscriptID_species">GeneSymbol-TranscriptID_species</option>
-            <option value="TranscriptSymbol-TranscriptID_species">TranscriptSymbol-TranscriptID_species</option>
+            <option value="TranscriptID-GeneSymbol_species">TranscriptID-GeneSymbol_species</option>
+            <option value="TranscriptID-TranscriptSymbol_species">TranscriptID-TranscriptSymbol_species</option>
             <option value="">Don't change</option>
         </param>
         <param name="regions" type="text" optional="true" label="Comma-separated list of region IDs (e.g. chromosomes or scaffolds) for which FASTA sequences should be filtered out" help="Region IDs are in the `seqid` column for GFF3 and in the `seq_region_name` field in JSON. This is typically used to filter out chromosomes with a non-standard genetic code, like mitochondria, to be analysed separately" />