Mercurial > repos > bgruening > augustus_training

diff extract_features.py @ 5:7be22100e5e1 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/augustus commit bba7f5df059fcbeb06e89cf689e9a04d4f22cb76"
author: iuc
date: Thu, 15 Jul 2021 17:16:33 +0000
parents: 86c89c3bd99d
--- a/extract_features.py	Fri Dec 20 14:09:14 2019 -0500
+++ b/extract_features.py	Thu Jul 15 17:16:33 2021 +0000
@@ -5,76 +5,88 @@
 import textwrap
 
 
-def main( args ):
+def main(args):
     """
-    Extract the protein and coding section from an augustus gff, gtf file
-    Example file:
-HS04636	AUGUSTUS	stop_codon	6901	6903	.	+	0	Parent=g1.t1
-HS04636	AUGUSTUS	transcription_end_site	8857	8857	.	+	.	Parent=g1.t1
-# protein sequence = [MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYIL
-# THFKGFWNVVNNIPFLRNAIMSYVLTSRSHLIDSPPTYNADYGYKSWEAFSNLSYYTRALPPVPDDCPTPLGVKGKKQLPDSNEIVEKLLLRRKFIPD
-# PQGSNMMFAFFAQHFTHQFFKTDHKRGPAFTNGLGHGVDLNHIYGETLARQRKLRLFKDGKMKYQIIDGEMYPPTVKDTQAEMIYPPQVPEHLRFAVG
-# QEVFGLVPGLMMYATIWLREHNRVCDVLKQEHPEWGDEQLFQTSRLILIGETIKIVIEDYVQHLSGYHFKLKFDPELLFNKQFQYQNRIAAEFNTLYH
-# WHPLLPDTFQIHDQKYNYQQFIYNNSILLEHGITQFVESFTRQIAGRVAGGRNVPPAVQKVSQASIDQSRQMKYQSFNEYRKRFMLKPYESFEELTGE
-# KEMSAELEALYGDIDAVELYPALLVEKPRPDAIFGETMVEVGAPFSLKGLMGNVICSPAYWKPSTFGGEVGFQIINTASIQSLICNNVKGCPFTSFSV
-# PDPELIKTVTINASSSRSGLDDINPTVLLKERSTEL]
-# end gene g1
-###
-#
-# ----- prediction on sequence number 2 (length = 2344, name = HS08198) -----
-#
-# Predicted genes for sequence number 2 on both strands
-# start gene g2
-HS08198	AUGUSTUS	gene	86	2344	1	+	.	ID=g2
-HS08198	AUGUSTUS	transcript	86	2344	.	+	.	ID=g2.t1;Parent=g2
-HS08198	AUGUSTUS	transcription_start_site	86	86	.	+	.	Parent=g2.t1
-HS08198	AUGUSTUS	exon	86	582	.	+	.	Parent=g2.t1
-HS08198	AUGUSTUS	start_codon	445	447	.	+	0	Parent=g2.t1
+        Extract the protein and coding section from an augustus gff, gtf file
+        Example file:
+    HS04636	AUGUSTUS	stop_codon	6901	6903	.	+	0	Parent=g1.t1
+    HS04636	AUGUSTUS	transcription_end_site	8857	8857	.	+	.	Parent=g1.t1
+    # protein sequence = [MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYIL
+    # THFKGFWNVVNNIPFLRNAIMSYVLTSRSHLIDSPPTYNADYGYKSWEAFSNLSYYTRALPPVPDDCPTPLGVKGKKQLPDSNEIVEKLLLRRKFIPD
+    # PQGSNMMFAFFAQHFTHQFFKTDHKRGPAFTNGLGHGVDLNHIYGETLARQRKLRLFKDGKMKYQIIDGEMYPPTVKDTQAEMIYPPQVPEHLRFAVG
+    # QEVFGLVPGLMMYATIWLREHNRVCDVLKQEHPEWGDEQLFQTSRLILIGETIKIVIEDYVQHLSGYHFKLKFDPELLFNKQFQYQNRIAAEFNTLYH
+    # WHPLLPDTFQIHDQKYNYQQFIYNNSILLEHGITQFVESFTRQIAGRVAGGRNVPPAVQKVSQASIDQSRQMKYQSFNEYRKRFMLKPYESFEELTGE
+    # KEMSAELEALYGDIDAVELYPALLVEKPRPDAIFGETMVEVGAPFSLKGLMGNVICSPAYWKPSTFGGEVGFQIINTASIQSLICNNVKGCPFTSFSV
+    # PDPELIKTVTINASSSRSGLDDINPTVLLKERSTEL]
+    # end gene g1
+    ###
+    #
+    # ----- prediction on sequence number 2 (length = 2344, name = HS08198) -----
+    #
+    # Predicted genes for sequence number 2 on both strands
+    # start gene g2
+    HS08198	AUGUSTUS	gene	86	2344	1	+	.	ID=g2
+    HS08198	AUGUSTUS	transcript	86	2344	.	+	.	ID=g2.t1;Parent=g2
+    HS08198	AUGUSTUS	transcription_start_site	86	86	.	+	.	Parent=g2.t1
+    HS08198	AUGUSTUS	exon	86	582	.	+	.	Parent=g2.t1
+    HS08198	AUGUSTUS	start_codon	445	447	.	+	0	Parent=g2.t1
     """
-    protein_seq = ''
-    coding_seq = ''
+    protein_seq = ""
+    coding_seq = ""
     if args.protein:
-        po = open( args.protein, 'w+' )
+        po = open(args.protein, "w+")
     if args.codingseq:
-        co = open( args.codingseq, 'w+' )
+        co = open(args.codingseq, "w+")
 
     for line in sys.stdin:
         # protein- and coding-sequence are stored as comments
-        if line.startswith('#'):
+        if line.startswith("#"):
             line = line[2:].strip()
-            if line.startswith('start gene'):
+            if line.startswith("start gene"):
                 gene_name = line[11:].strip()
 
             if protein_seq:
-                if line.endswith(']'):
+                if line.endswith("]"):
                     protein_seq += line[:-1]
-                    po.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( protein_seq, 80 ) ) ) )
-                    protein_seq = ''
+                    po.write(
+                        ">%s\n%s\n"
+                        % (gene_name, "\n".join(textwrap.wrap(protein_seq, 80)))
+                    )
+                    protein_seq = ""
                 else:
                     protein_seq += line
 
             if coding_seq:
-                if line.endswith(']'):
+                if line.endswith("]"):
                     coding_seq += line[:-1]
-                    co.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( coding_seq, 80 ) ) ) )
-                    coding_seq = ''
+                    co.write(
+                        ">%s\n%s\n"
+                        % (gene_name, "\n".join(textwrap.wrap(coding_seq, 80)))
+                    )
+                    coding_seq = ""
                 else:
                     coding_seq += line
 
-            if args.protein and line.startswith('protein sequence = ['):
-                if line.endswith(']'):
+            if args.protein and line.startswith("protein sequence = ["):
+                if line.endswith("]"):
                     protein_seq = line[20:-1]
-                    po.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( protein_seq, 80 ) ) ) )
-                    protein_seq = ''
+                    po.write(
+                        ">%s\n%s\n"
+                        % (gene_name, "\n".join(textwrap.wrap(protein_seq, 80)))
+                    )
+                    protein_seq = ""
                 else:
                     line = line[20:]
                     protein_seq = line
 
-            if args.codingseq and line.startswith('coding sequence = ['):
-                if line.endswith(']'):
+            if args.codingseq and line.startswith("coding sequence = ["):
+                if line.endswith("]"):
                     coding_seq = line[19:-1]
-                    co.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( coding_seq, 80 ) ) ) )
-                    coding_seq = ''
+                    co.write(
+                        ">%s\n%s\n"
+                        % (gene_name, "\n".join(textwrap.wrap(coding_seq, 80)))
+                    )
+                    coding_seq = ""
                 else:
                     line = line[19:]
                     coding_seq = line
@@ -85,10 +97,10 @@
         po.close()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-p', '--protein', help='Path to the protein file.')
-    parser.add_argument('-c', '--codingseq', help='Path to the coding file.')
+    parser.add_argument("-p", "--protein", help="Path to the protein file.")
+    parser.add_argument("-c", "--codingseq", help="Path to the coding file.")
 
     args = parser.parse_args()
-    main( args )
+    main(args)
author	iuc
date	Thu, 15 Jul 2021 17:16:33 +0000
parents	86c89c3bd99d
children