Mercurial > repos > bgruening > augustus_training
diff extract_features.py @ 5:7be22100e5e1 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/augustus commit bba7f5df059fcbeb06e89cf689e9a04d4f22cb76"
author | iuc |
---|---|
date | Thu, 15 Jul 2021 17:16:33 +0000 |
parents | 86c89c3bd99d |
children |
line wrap: on
line diff
--- a/extract_features.py Fri Dec 20 14:09:14 2019 -0500 +++ b/extract_features.py Thu Jul 15 17:16:33 2021 +0000 @@ -5,76 +5,88 @@ import textwrap -def main( args ): +def main(args): """ - Extract the protein and coding section from an augustus gff, gtf file - Example file: -HS04636 AUGUSTUS stop_codon 6901 6903 . + 0 Parent=g1.t1 -HS04636 AUGUSTUS transcription_end_site 8857 8857 . + . Parent=g1.t1 -# protein sequence = [MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYIL -# THFKGFWNVVNNIPFLRNAIMSYVLTSRSHLIDSPPTYNADYGYKSWEAFSNLSYYTRALPPVPDDCPTPLGVKGKKQLPDSNEIVEKLLLRRKFIPD -# PQGSNMMFAFFAQHFTHQFFKTDHKRGPAFTNGLGHGVDLNHIYGETLARQRKLRLFKDGKMKYQIIDGEMYPPTVKDTQAEMIYPPQVPEHLRFAVG -# QEVFGLVPGLMMYATIWLREHNRVCDVLKQEHPEWGDEQLFQTSRLILIGETIKIVIEDYVQHLSGYHFKLKFDPELLFNKQFQYQNRIAAEFNTLYH -# WHPLLPDTFQIHDQKYNYQQFIYNNSILLEHGITQFVESFTRQIAGRVAGGRNVPPAVQKVSQASIDQSRQMKYQSFNEYRKRFMLKPYESFEELTGE -# KEMSAELEALYGDIDAVELYPALLVEKPRPDAIFGETMVEVGAPFSLKGLMGNVICSPAYWKPSTFGGEVGFQIINTASIQSLICNNVKGCPFTSFSV -# PDPELIKTVTINASSSRSGLDDINPTVLLKERSTEL] -# end gene g1 -### -# -# ----- prediction on sequence number 2 (length = 2344, name = HS08198) ----- -# -# Predicted genes for sequence number 2 on both strands -# start gene g2 -HS08198 AUGUSTUS gene 86 2344 1 + . ID=g2 -HS08198 AUGUSTUS transcript 86 2344 . + . ID=g2.t1;Parent=g2 -HS08198 AUGUSTUS transcription_start_site 86 86 . + . Parent=g2.t1 -HS08198 AUGUSTUS exon 86 582 . + . Parent=g2.t1 -HS08198 AUGUSTUS start_codon 445 447 . + 0 Parent=g2.t1 + Extract the protein and coding section from an augustus gff, gtf file + Example file: + HS04636 AUGUSTUS stop_codon 6901 6903 . + 0 Parent=g1.t1 + HS04636 AUGUSTUS transcription_end_site 8857 8857 . + . Parent=g1.t1 + # protein sequence = [MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYIL + # THFKGFWNVVNNIPFLRNAIMSYVLTSRSHLIDSPPTYNADYGYKSWEAFSNLSYYTRALPPVPDDCPTPLGVKGKKQLPDSNEIVEKLLLRRKFIPD + # PQGSNMMFAFFAQHFTHQFFKTDHKRGPAFTNGLGHGVDLNHIYGETLARQRKLRLFKDGKMKYQIIDGEMYPPTVKDTQAEMIYPPQVPEHLRFAVG + # QEVFGLVPGLMMYATIWLREHNRVCDVLKQEHPEWGDEQLFQTSRLILIGETIKIVIEDYVQHLSGYHFKLKFDPELLFNKQFQYQNRIAAEFNTLYH + # WHPLLPDTFQIHDQKYNYQQFIYNNSILLEHGITQFVESFTRQIAGRVAGGRNVPPAVQKVSQASIDQSRQMKYQSFNEYRKRFMLKPYESFEELTGE + # KEMSAELEALYGDIDAVELYPALLVEKPRPDAIFGETMVEVGAPFSLKGLMGNVICSPAYWKPSTFGGEVGFQIINTASIQSLICNNVKGCPFTSFSV + # PDPELIKTVTINASSSRSGLDDINPTVLLKERSTEL] + # end gene g1 + ### + # + # ----- prediction on sequence number 2 (length = 2344, name = HS08198) ----- + # + # Predicted genes for sequence number 2 on both strands + # start gene g2 + HS08198 AUGUSTUS gene 86 2344 1 + . ID=g2 + HS08198 AUGUSTUS transcript 86 2344 . + . ID=g2.t1;Parent=g2 + HS08198 AUGUSTUS transcription_start_site 86 86 . + . Parent=g2.t1 + HS08198 AUGUSTUS exon 86 582 . + . Parent=g2.t1 + HS08198 AUGUSTUS start_codon 445 447 . + 0 Parent=g2.t1 """ - protein_seq = '' - coding_seq = '' + protein_seq = "" + coding_seq = "" if args.protein: - po = open( args.protein, 'w+' ) + po = open(args.protein, "w+") if args.codingseq: - co = open( args.codingseq, 'w+' ) + co = open(args.codingseq, "w+") for line in sys.stdin: # protein- and coding-sequence are stored as comments - if line.startswith('#'): + if line.startswith("#"): line = line[2:].strip() - if line.startswith('start gene'): + if line.startswith("start gene"): gene_name = line[11:].strip() if protein_seq: - if line.endswith(']'): + if line.endswith("]"): protein_seq += line[:-1] - po.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( protein_seq, 80 ) ) ) ) - protein_seq = '' + po.write( + ">%s\n%s\n" + % (gene_name, "\n".join(textwrap.wrap(protein_seq, 80))) + ) + protein_seq = "" else: protein_seq += line if coding_seq: - if line.endswith(']'): + if line.endswith("]"): coding_seq += line[:-1] - co.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( coding_seq, 80 ) ) ) ) - coding_seq = '' + co.write( + ">%s\n%s\n" + % (gene_name, "\n".join(textwrap.wrap(coding_seq, 80))) + ) + coding_seq = "" else: coding_seq += line - if args.protein and line.startswith('protein sequence = ['): - if line.endswith(']'): + if args.protein and line.startswith("protein sequence = ["): + if line.endswith("]"): protein_seq = line[20:-1] - po.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( protein_seq, 80 ) ) ) ) - protein_seq = '' + po.write( + ">%s\n%s\n" + % (gene_name, "\n".join(textwrap.wrap(protein_seq, 80))) + ) + protein_seq = "" else: line = line[20:] protein_seq = line - if args.codingseq and line.startswith('coding sequence = ['): - if line.endswith(']'): + if args.codingseq and line.startswith("coding sequence = ["): + if line.endswith("]"): coding_seq = line[19:-1] - co.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( coding_seq, 80 ) ) ) ) - coding_seq = '' + co.write( + ">%s\n%s\n" + % (gene_name, "\n".join(textwrap.wrap(coding_seq, 80))) + ) + coding_seq = "" else: line = line[19:] coding_seq = line @@ -85,10 +97,10 @@ po.close() -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-p', '--protein', help='Path to the protein file.') - parser.add_argument('-c', '--codingseq', help='Path to the coding file.') + parser.add_argument("-p", "--protein", help="Path to the protein file.") + parser.add_argument("-c", "--codingseq", help="Path to the coding file.") args = parser.parse_args() - main( args ) + main(args)