Repository 'gstf_preparation'
hg clone https://toolshed.g2.bx.psu.edu/repos/earlhaminst/gstf_preparation

Changeset 9:f4acbfe8d6fe (2018-10-17)
Previous changeset 8:92f3966d5bc3 (2018-05-16) Next changeset 10:e8e75a79de59 (2019-10-31)
Commit message:
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
modified:
gstf_preparation.py
removed:
test-data/test2.fasta
b
diff -r 92f3966d5bc3 -r f4acbfe8d6fe gstf_preparation.py
--- a/gstf_preparation.py Wed May 16 20:03:57 2018 -0400
+++ b/gstf_preparation.py Wed Oct 17 07:31:29 2018 -0400
[
@@ -264,10 +264,10 @@
 
     cur.execute('SELECT species, seq_region_name FROM transcript_species WHERE transcript_id=?',
                 (transcript_id, ))
-    results = cur.fetchone()
-    if not results:
-        return None
-    return results
+    row = cur.fetchone()
+    if not row:
+        return (None, None)
+    return row
 
 
 def fetch_gene_id_for_transcript(conn, transcript_id):
@@ -275,17 +275,18 @@
 
     cur.execute('SELECT gene_id FROM transcript WHERE transcript_id=?',
                 (transcript_id, ))
-    results = cur.fetchone()
-    if not results:
+    row = cur.fetchone()
+    if not row:
         return None
-    return results[0]
+    return row[0]
 
 
-def remove_id_version(s):
+def remove_id_version(s, force=False):
     """
-    Remove the optional '.VERSION' from an Ensembl id.
+    Remove the optional '.VERSION' from an id if it's an Ensembl id or if
+    `force` is True.
     """
-    if s.startswith('ENS'):
+    if force or s.startswith('ENS'):
         return s.split('.')[0]
     else:
         return s
@@ -358,7 +359,7 @@
                     print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr)
 
         for unimplemented_feature, nlines in unimplemented_feature_nlines_dict.items():
-            print("Skipped %d lines in file '%s': '%s' is not an implemented feature type" % (nlines, filename, unimplemented_feature), file=sys.stderr)
+            print("Skipped %d lines in GFF3 file '%s': '%s' is not an implemented feature type" % (nlines, filename, unimplemented_feature), file=sys.stderr)
 
         join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict)
         write_gene_dict_to_db(conn, gene_dict)
@@ -367,47 +368,68 @@
         with open(json_arg) as f:
             write_gene_dict_to_db(conn, json.load(f))
 
-    if options.longestCDS:
-        gene_transcripts_dict = dict()
-        for fasta_arg in options.fasta:
-            for entry in FASTAReader_gen(fasta_arg):
-                # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id
-                transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
+    # Read the FASTA files a first time to:
+    # - determine for each file if we need to force the removal of the version
+    #   from the transcript id
+    # - fill gene_transcripts_dict when keeping only the longest CDS per gene
+    force_remove_id_version_file_list = []
+    gene_transcripts_dict = dict()
+    for fasta_arg in options.fasta:
+        force_remove_id_version = False
+        found_gene_transcript = False
+        for entry in FASTAReader_gen(fasta_arg):
+            # Extract the transcript id by removing everything after the first space and then removing the version if needed
+            transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version)
 
-                if len(entry.sequence) % 3 != 0:
-                    print("Transcript '%s' in file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr)
-                    continue
+            if len(entry.sequence) % 3 != 0:
+                continue
 
+            gene_id = fetch_gene_id_for_transcript(conn, transcript_id)
+            if not gene_id and not found_gene_transcript:
+                # We have not found a proper gene transcript in this file yet,
+                # try to force the removal of the version from the transcript id
+                transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], True)
                 gene_id = fetch_gene_id_for_transcript(conn, transcript_id)
-                if not gene_id:
-                    print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
-                    continue
+                # Remember that we need to force the removal for this file
+                if gene_id:
+                    force_remove_id_version = True
+                    force_remove_id_version_file_list.append(fasta_arg)
+                    print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr)
+            if not gene_id:
+                print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
+                continue
+            if options.longestCDS:
+                found_gene_transcript = True
+            else:
+                break
 
-                if gene_id in gene_transcripts_dict:
-                    gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence)))
-                else:
-                    gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))]
+            if gene_id in gene_transcripts_dict:
+                gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence)))
+            else:
+                gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))]
 
-        # For each gene, select the transcript with the longest sequence
-        # If more than one transcripts have the same longest sequence for a gene, the
-        # first one to appear in the FASTA file is selected
+    if options.longestCDS:
+        # For each gene, select the transcript with the longest sequence.
+        # If more than one transcripts have the same longest sequence for a
+        # gene, the first one to appear in the FASTA file is selected
         selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()]
 
     regions = [_.strip().lower() for _ in options.regions.split(",")]
     with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file:
         for fasta_arg in options.fasta:
+            force_remove_id_version = fasta_arg in force_remove_id_version_file_list
             for entry in FASTAReader_gen(fasta_arg):
-                transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
+                transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version)
                 if options.longestCDS and transcript_id not in selected_transcript_ids:
                     continue
 
                 if len(entry.sequence) % 3 != 0:
-                    print("Transcript '%s' in file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr)
+                    print("Transcript '%s' in FASTA file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr)
                     continue
 
                 species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id)
                 if not species_for_transcript:
-                    print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
+                    print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
                     continue
 
                 if options.headers:
b
diff -r 92f3966d5bc3 -r f4acbfe8d6fe test-data/test2.fasta
--- a/test-data/test2.fasta Wed May 16 20:03:57 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,1265 +0,0 @@\n->ENST00000338702_homosapiens\n-ATGGAGAATCAAGAGAAGGCGAGTATCGCGGGCCACATGTTCGACGTAGTCGTGATCGGA\n-GGTGGCATTTCAGGACTATCTGCTGCCAAACTCTTGACTGAATATGGCGTTAGTGTTTTG\n-GTTTTAGAAGCTCGGGACAGGGTTGGAGGAAGAACATATACTATAAGGAATGAGCATGTT\n-GATTACGTAGATGTTGGTGGAGCTTATGTGGGACCAACCCAAAACAGAATCTTACGCTTG\n-TCTAAGGAGCTGGGCATAGAGACTTACAAAGTGAATGTCAGTGAGCGTCTCGTTCAATAT\n-GTCAAGGGGAAAACATATCCATTTCGGGGCGCCTTTCCACCAGTATGGAATCCCATTGCA\n-TATTTGGATTACAATAATCTGTGGAGGACAATAGATAACATGGGGAAGGAGATTCCAACT\n-GATGCACCCTGGGAGGCTCAACATGCTGACAAATGGGACAAAATGACCATGAAAGAGCTC\n-ATTGACAAAATCTGCTGGACAAAGACTGCTAGGCGGTTTGCTTATCTTTTTGTGAATATC\n-AATGTGACCTCTGAGCCTCACGAAGTGTCTGCCCTGTGGTTCTTGTGGTATGTGAAGCAG\n-TGCGGGGGCACCACTCGGATATTCTCTGTCACCAATGGTGGCCAGGAACGGAAGTTTGTA\n-GGTGGATCTGGTCAAGTGAGCGAACGGATAATGGACCTCCTCGGAGACCAAGTGAAGCTG\n-AACCATCCTGTCACTCACGTTGACCAGTCAAGTGACAACATCATCATAGAGACGCTGAAC\n-CATGAACATTATGAGTGCAAATACGTAATTAATGCGATCCCTCCGACCTTGACTGCCAAG\n-ATTCACTTCAGACCAGAGCTTCCAGCAGAGAGAAACCAGTTAATTCAGCGGCTTCCAATG\n-GGAGCTGTCATTAAGTGCATGATGTATTACAAGGAGGCCTTCTGGAAGAAGAAGGATTAC\n-TGTGGCTGCATGATCATTGAAGATGAAGATGCTCCAATTTCAATAACCTTGGATGACACC\n-AAGCCAGATGGGTCACTGCCTGCCATCATGGGCTTCATTCTTGCCCGGAAAGCTGATCGA\n-CTTGCTAAGCTACATAAGGAAATAAGGAAGAAGAAAATCTGTGAGCTCTATGCCAAAGTG\n-CTGGGATCCCAAGAAGCTTTACATCCAGTGCATTATGAAGAGAAGAACTGGTGTGAGGAG\n-CAGTACTCTGGGGGCTGCTACACGGCCTACTTCCCTCCTGGGATCATGACTCAATATGGA\n-AGGGTGATTCGTCAACCCGTGGGCAGGATTTTCTTTGCGGGCACAGAGACTGCCACAAAG\n-TGGAGCGGCTACATGGAAGGGGCAGTTGAGGCTGGAGAACGAGCAGCTAGGGAGGTCTTA\n-AATGGTCTCGGGAAGGTGACCGAGAAAGATATCTGGGTACAAGAACCTGAATCAAAGGAC\n-GTTCCAGCGGTAGAAATCACCCACACCTTCTGGGAAAGGAACCTGCCCTCTGTTTCTGGC\n-CTGCTGAAGATCATTGGATTTTCCACATCAGTAACTGCCCTGGGGTTTGTGCTGTACAAA\n-TACAAGCTCCTGCCACGGTCTTGA\n->ENST00000542639_homosapiens\n-ATGGGGAAGGAGATTCCAACTGATGCACCCTGGGAGGCTCAACATGCTGACAAATGGGAC\n-AAAATGACCATGAAAGAGCTCATTGACAAAATCTGCTGGACAAAGACTGCTAGGCGGTTT\n-GCTTATCTTTTTGTGAATATCAATGTGACCTCTGAGCCTCACGAAGTGTCTGCCCTGTGG\n-TTCTTGTGGTATGTGAAGCAGTGCGGGGGCACCACTCGGATATTCTCTGTCACCAATGGT\n-GGCCAGGAACGGAAGTTTGTAGGTGGATCTGGTCAAGTGAGCGAACGGATAATGGACCTC\n-CTCGGAGACCAAGTGAAGCTGAACCATCCTGTCACTCACGTTGACCAGTCAAGTGACAAC\n-ATCATCATAGAGACGCTGAACCATGAACATTATGAGTGCAAATACGTAATTAATGCGATC\n-CCTCCGACCTTGACTGCCAAGATTCACTTCAGACCAGAGCTTCCAGCAGAGAGAAACCAG\n-TTAATTCAGCGGCTTCCAATGGGAGCTGTCATTAAGTGCATGATGTATTACAAGGAGGCC\n-TTCTGGAAGAAGAAGGATTACTGTGGCTGCATGATCATTGAAGATGAAGATGCTCCAATT\n-TCAATAACCTTGGATGACACCAAGCCAGATGGGTCACTGCCTGCCATCATGGGCTTCATT\n-CTTGCCCGGAAAGCTGATCGACTTGCTAAGCTACATAAGGAAATAAGGAAGAAGAAAATC\n-TGTGAGCTCTATGCCAAAGTGCTGGGATCCCAAGAAGCTTTACATCCAGTGCATTATGAA\n-GAGAAGAACTGGTGTGAGGAGCAGTACTCTGGGGGCTGCTACACGGCCTACTTCCCTCCT\n-GGGATCATGACTCAATATGGAAGGGTGATTCGTCAACCCGTGGGCAGGATTTTCTTTGCG\n-GGCACAGAGACTGCCACAAAGTGGAGCGGCTACATGGAAGGGGCAGTTGAGGCTGGAGAA\n-CGAGCAGCTAGGGAGGTCTTAAATGGTCTCGGGAAGGTGACCGAGAAAGATATCTGGGTA\n-CAAGAACCTGAATCAAAGGACGTTCCAGCGGTAGAAATCACCCACACCTTCTGGGAAAGG\n-AACCTGCCCTCTGTTTCTGGCCTGCTGAAGATCATTGGATTTTCCACATCAGTAACTGCC\n-CTGGGGTTTGTGCTGTACAAATACAAGCTCCTGCCACGGTCTTGA\n->ENSPTRT00000040520_pantroglodytes\n-ATGGAGAATCAAGAGAAGGCGAGTATCGCGGGCCACATGTTCGACGTAGTCGTGATCGGA\n-GGTGGCATTTCAGGACTATCTGCTGCCAAACTCTTGACTGAATATGGCGTTAGTGTTTTA\n-GTTTTAGAAGCTCGGGACAGGGTTGGAGGAAGAACATATACTATAAGGAATGAGCATGTT\n-GATTACGTAGATGTTGGTGGAGCTTATGTGGGACCAACCCAAAACAGAATCTTACGCTTG\n-TCTAAGGAGCTGGGCATAGAGACTTACAAAGTGAATGTCAGTGAGCGTCTCGTTCAATAT\n-GTCAAGGGGAAAACATATCCATTTCGGGGCGCCTTTCCACCAGTATGGAATCCCATTGCA\n-TATTTGGATTACAATAATCTGTGGCGGACAATAGATAACATGGGGAAGGAGATTCCAAAT\n-GATGCACCCTGGGAGGCTCAACATGCTGACGAATGGGACAAAATGACCATGAAAGAGCTC\n-ATTGACAAAATCTGCTGGACAAAGACTGCTAGGCGGTTTGCTTATCTTTTTGTGAATATC\n-AATGTGACCTCTGAGCCTCACGAAGTGTCTGCCCTGTGGTTCTTGTGGTATGTGAAGCAG\n-TGCGGGGGCACCACTCGGATATTCTCTGTCACCAATGGCGGCCAGGAACGGAAGTTTGTA\n-GGTGGATCTGGTCAAGTGAGCGAACGGATAATGGACCTCCTTGGAGACCAAGTGAAGCTG\n-AACCATCCTGTCACTCATGTTGACCAGTCAAGTGACAACATCATCATAGAGACACTGAAC\n-CATGAACATTATGAGTGCAAATACGTAATTAATGCGATCCCTCCGACCTTGACTGCCAAG\n-ATTCACTTCAGACCAGAGCTTCCAGCAGAGAGAAACCAGTTAATTCAGCGTCTTCCAATG\n-GGAGCTATCATTAAGTGCATGATGTATTACAAGGAGGCCTTCTGGAAGAAGAAGGATTAC\n-TGTGGCTGCATGATCATTGAAGATGAAG'..b'GCACAATCAG\n-AGTGAGTATGACGACTCGGCCAGCGAGTGCTGCTCATGTCCTAAGACTGACTCTCAGATC\n-CTGAAGGAGCTGGAGGAGTCTTCATTCAGGAAGACCTTCGAGGATTACCTGCACAACGTG\n-GTTTTTGTTCCCAGAAAAACCTCTTCAGGCAATGGTGCTGAGGACACTAGGCCATCCCGA\n-AAGCGAAGATCCCTTGAAGAGGTGGGCAATGTGACAGCCACTACACCCACACTTCCAGAT\n-TTTCCCAACATCTCCTCCACCATCGCGCCCACAAGCCACGAAGAGCACAGACCATTTGAG\n-AAAGTAGTAAACAAGGAGTCACTTGTCATCTCTGGCCTGAGACACTTCACTGGGTACCGC\n-ATTGAGCTGCAGGCATGCAATCAGGACTCCCCAGAAGAGAGGTGCAGCGTGGCTGCCTAC\n-GTCAGTGCCCGGACCATGCCTGAAGCTAAGGCAGATGACATCGTTGGCCCTGTGACCCAT\n-GAAATCTTTGAGAACAATGTTGTACACTTAATGTGGCAAGAGCCAAAGGAACCTAATGGT\n-CTGATTGTGCTATATGAAGTGAGCTATCGGCGATATGGTGATGAGGAGCTGCACCTCTGT\n-GTCTCCCGGAAGCATTTTGCCCTGGAGCGGGGCTGCAGGCTTCGAGGGCTCTCTCCAGGA\n-AACTACAGTGTTCGAGTCCGGGCTACCTCTCTGGCAGGAAATGGCTCCTGGACAGAACCC\n-ACCTATTTTTATGTGACTGATTATTTAGATGTCCCATCAAATATTGCCAAAATTATCATC\n-GGGCCCCTCATCTTCGTCTTCCTCTTCAGTGTCGTGATCGGAAGTATTTATCTATTCTTG\n-AGGAAGAGGCAGCCAGATGGGCCAATGGGACCACTGTACGCTTCTTCAAACCCAGAGTAC\n-CTCAGTGCCAGTGATGTCTTTCCATCTTCCGTATACGTTCCGGATGAGTGGGAGGTACCT\n-CGAGAGAAGATCACCCTCCTCCGAGAGCTGGGGCAGGGATCCTTCGGTATGGTGTACGAA\n-GGCAATGCCAAGGATATCATCAAGGGTGAGGTAGAGACCCGTGTTGCGGTGAAGACGGTC\n-AATGAGTCAGCCAGTCTTCGAGAACGGATCGAGTTCCTCAATGAGGCATCAGTCATGAAG\n-GGCTTCACCTGTCATCACGTGGTCCGCCTTCTTGGGGTGGTGTCCAAAGGCCAGCCCACA\n-TTGGTAGTGATGGAACTGATGGCTCATGGAGACCTGAAAAGTCACCTCCGTTCTCTGCGG\n-CCCGATGCTGAGAACAACCCAGGCCGTCCTCCCCCTACCTTGCAAGAAATGATTCAGATG\n-ACAGCAGAAATTGCCGATGGCATGGCATACTTGAACGCCAAGAAGTTTGTGCACCGGGAC\n-CTGGCAGCTCGGAACTGCATGGTTGCCCATGATTTTACTGTCAAAATCGGAGACTTTGGA\n-ATGACGAGAGACATCTACGAGACAGATTACTATCGGAAAGGGGGCAAGGGGTTGCTGCCC\n-GTGAGGTGGATGTCACCCGAGTCCCTGAAGGACGGAGTCTTCACTGCTTCTTCCGACATG\n-TGGTCCTTTGGGGTGGTCCTTTGGGAAATCACCAGCCTGGCTGAGCAACCTTACCAAGGC\n-CTGTCTAATGAACAGGTGTTGAAATTTGTCATGGATGGAGGCTATCTGGATCCCCCTGAT\n-AACTGTCCAGAGAGACTCACTGACCTGATGCGCATGTGCTGGCAGTTCAACCCCAAGATG\n-AGGCCGACCTTCCTGGAAATCGTCAACCTGCTCAAGGACGACCTCCACCCCAGCTTTCCG\n-GAAGTTTCCTTCTTCTACAGCGAGGAGAACAAGGCTCCCGAGAGTGAAGAGCTGGAGATG\n-GAGTTCGAGGACATGGAGAATGTCCCCTTGGATCGTTCCTCTCACTGTCAGAGAGAAGAG\n-GCTGGATGCCGGGAGGGAGGGTCCTCTCTGAGCATCAAACGGACCTATGATGAACACATC\n-CCCTACACCCACATGAACGGGGGCAAGAAGAATGGGCGGGTCCTCACCCTGCCGAGGTCG\n-AACCCTTCCTAA\n->ENSSSCT00000014817_susscrofa\n-GTGTGCCCAGGGATGGATATCCGGAATAACCTTACACGGCTGCACGAGTTGGCCAACTGC\n-TCGGTCATCGAAGGACATTTGCAGATCCTGTTGATGTTCAAAACGCGGCCCGAGGATTTC\n-CGAGACCTCAGTTTCCCCAAACTCATCATGATCACTGATTACTTGCTGCTCTTCCGGGTC\n-TACGGGCTGGAGAGCCTGAAGGACCTGTTCCCCAACCTCACCGTCATCCGGGGCTCACGC\n-CTCTTCTTTAACTATGCGCTGGTCATCTTTGAGATGGTTCACTTGAAGGAGCTTGGCCTC\n-TACAATTTGATGAACATCACCAGGGGTGCTGTCCGCATCGAGAAGAACAATGAGCTCTGC\n-TACCTGGCGACCATTGACTGGTCGCGCATCCTGGACTCTGTGGAGGATAATTACATTGTG\n-CTGAACAAAGACGACAACGAGGAGTGTGGGGACATTTGCCCAGGCACTGCGAAGGGCAAG\n-ACCAATTGCCCTGCCACCGTCATCAATGGGCAATTTGTCGAGCGGTGTTGGACGCACAGT\n-CACTGCCAGAAAGTGTGCCCGACCATCTGTAAGTCGCACGGCTGCACTGCTGAGGGCCTC\n-TGCTGTCACAGCGAGTGTTTGGGCAACTGCTCTGAGCCAGACGACCCCACCAAGTGCGTG\n-GCCTGCCGCAACTTCTACCTGGACGGCAGATGCGTGGAGACCTGCCCGCCCCCCTACTAC\n-CACTTCCAAGACTGGCGCTGCGTGAACTTCAGCTTCTGCCAGGACCTGCACAACAAATGC\n-AAGAACTCAAGGAGGCAGGGCTGCCACCAGTACGTCATTCACAACAACAAGTGTATCCCT\n-GAGTGCCCCTCAGGGTACACGATGAATTCCAGCAACTTGATGTGCACTCCGTGCCTAGGC\n-CCCTGTCCCAAAGTGTGTCACCTCCTGGAAGGCGAGAAGACCATCGACTCAGTGACATCC\n-GCCCAGGAGCTCCGAGGCTGCACCATTATCAACGGGAGCCTAATCATCAACATTCGAGGA\n-GGCAACAACCTGGCAGCCGAACTAGAGGCCAACCTTGGACTCATTGAGGAGATTTCAGGG\n-TACCTGAAAATCCGCCGATCCTATGCCCTCGTGTCACTTTCCTTCTTCCGGAAGTTGCGT\n-CTGATCCGAGGGGAGACGTTGGAAATTGGGAACTATTCTTTCTATGCCTTGGACAACCAG\n-AACCTAAGGCAACTGTGGGACTGGAGCAAACACAACCTCACCATCACTCAGGGGAAACTC\n-TTCTTCCATTATAATCCCAAACTCTGCTTGTCGGAAATTCACAAGATGGAGGAAGTTTCT\n-GGAACCAAGGGGCGCCAGGAGAGAAATGATATTGCCCTGAAGACCAATGGGGACCAGGCG\n-TCCTGTGAAAATGAGTTACTTAAATTTTCTTACATTCGGACATCTTATGACAAGATCTTG\n-CTGAAGTGGGAGCCGTATTGGCCCCCCGACTTCCGAGACCTCCTGGGGTTCATGCTCTTC\n-TACAAAGAGGCCCCTTATCAGAACGTGACGGAGTTTGACGGGCAGGATGCGTGTGGCTCC\n-AACAGCTGGACGGTGGTGGACATTGACCCGCCTACGAGGTCCAATGACCCCAAGTCCCAG\n-AACCATCCTGGGTGGCTGATGCGTGGTCTCAAGCCCTGGACCCAGTATGCCATCTTTGTC\n-AAGACTTTGGTCACCTTTTCTGATGAACGACGCACCTATGGAGCCAAGAGTGACATCATC\n-TACGTCCAGACAGATGCCACAAGTAAGCATGTC\n-\n'