gstf_preparation: gstf_preparation.py comparison

comparison gstf_preparation.py @ 4:284f64ad9d43 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty

author	earlhaminst
date	Fri, 08 Dec 2017 05:32:12 -0500
parents	7e11a7f4bdba
children	b3ba0c84667c

comparison

equal deleted inserted replaced

-:7e11a7f4bdba
+:284f64ad9d43
 if not results:
 return None
 return results[0]
+def fetch_gene_id_for_transcript(conn, transcript_id):
+cur = conn.cursor()
+cur.execute('SELECT gene_id FROM transcript WHERE transcript_id=?',
+(transcript_id, ))
+results = cur.fetchone()
+if not results:
+return None
+return results[0]
 def remove_id_version(s):
 """
 Remove the optional '.VERSION' from an Ensembl id.
 """
 if s.startswith('ENS'):
 def __main__():
 parser = optparse.OptionParser()
 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files')
 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files')
 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files')
+parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene')
+parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format')
 parser.add_option('-o', '--output', help='Path of the output SQLite file')
 parser.add_option('--of', help='Path of the output FASTA file')
 options, args = parser.parse_args()
 if args:
 raise Exception('Use options to provide inputs')
 transcript_dict = dict()
 exon_parent_dict = dict()
 cds_parent_dict = dict()
 five_prime_utr_parent_dict = dict()
 three_prime_utr_parent_dict = dict()
 with open(filename) as f:
 for i, line in enumerate(f, start=1):
 line = line.strip()
 if not line:
 # skip empty lines
 for json_arg in options.json:
 with open(json_arg) as f:
 write_gene_dict_to_db(conn, json.load(f))
-with open(options.of, 'w') as output_fasta_file:
+if options.longestCDS:
+gene_transcripts_dict = dict()
 for fasta_arg in options.fasta:
 for entry in FASTAReader_gen(fasta_arg):
 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id
 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
+gene_id = fetch_gene_id_for_transcript(conn, transcript_id)
+if not gene_id:
+continue
+if gene_id in gene_transcripts_dict:
+gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence)))
+else:
+gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))]
+# For each gene, select the transcript with the longest sequence
+# If more than one transcripts have the same longest sequence for a gene, the
+# first one to appear in the FASTA file is selected
+selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()]
+with open(options.of, 'w') as output_fasta_file:
+for fasta_arg in options.fasta:
+for entry in FASTAReader_gen(fasta_arg):
+transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
+if options.longestCDS and transcript_id not in selected_transcript_ids:
+continue
 species_for_transcript = fetch_species_for_transcript(conn, transcript_id)
 if not species_for_transcript:
 print("Transcript '%s' not found in the gene feature information" % transcript_id, file=sys.stderr)
 continue
-# Remove any underscore in the species
-species_for_transcript = species_for_transcript.replace('_', '')
+if options.headers:
-# Write the FASTA sequence using '>TranscriptId_species' as the header, as required by TreeBest
+# Change the FASTA header to '>TranscriptId_species', as required by TreeBest
-output_fasta_file.write(">%s_%s\n%s\n" % (transcript_id, species_for_transcript, entry.sequence))
+# Remove any underscore in the species
+header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', ''))
+else:
+header = entry.header
+output_fasta_file.write("%s\n%s\n" % (header, entry.sequence))
 conn.close()
 if __name__ == '__main__':

Mercurial > repos > earlhaminst > gstf_preparation

comparison gstf_preparation.py @ 4:284f64ad9d43 draft