# HG changeset patch # User pjbriggs # Date 1521628781 14400 # Node ID b67ea47730d324cdcabdb456c34c8014559c4037 Version 1.0.1. diff -r 000000000000 -r b67ea47730d3 CountUniqueIDs.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CountUniqueIDs.xml Wed Mar 21 06:39:41 2018 -0400 @@ -0,0 +1,45 @@ + + + Gives the non-redundant count of sequences + + motif_tools_macros.xml + + + + + + + + + + + +.. class:: infomark + +**What it does** + +This tool counts the number of non-redundant sequence identifiers (seqname) in a GFF file. The tool was originally written to read a GFF file containing a set of motif matches and report the number of sequences that contain one or more instances of the scanned motif. + +---- + +.. class:: infomark + +**Options** + +A GFF formatted file is required. + +---- + +.. class:: infomark + +**Credits** + +This Galaxy tool has been developed within the Bioinformatics Core Facility at the University of Manchester. It runs the CountUniqueIDs.pl Perl script that was written by Ian Donaldson. + +Please kindly acknowledge both this Galaxy tool and CountUniqueIDs.pl if you use it. + + + + diff -r 000000000000 -r b67ea47730d3 README.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.rst Wed Mar 21 06:39:41 2018 -0400 @@ -0,0 +1,77 @@ +motif_tools +=========== + +Galaxy tools for various motif-finding utilities developed by Ian Donaldson. + +There are five tools available: + + * **IUPAC scan and output each match** Returns all matches to a given IUPAC in + GFF format + + * **IUPAC scan and output matches per seq** Counts the matches to a given IUPAC + + * **Count unique seq in GFF** Gives the non-redundant count of sequences + + * **TFBScluster two TFBS** Identifies clusters of two TFBS + + * **TFBScluster three TFBS** Identifies clusters of three TFBS + +Automated installation +====================== + +Installation via the Galaxy Tool Shed will take care of installing the tools +and the underlying dependencies. 
+ +Manual Installation +=================== + +To add these to Galaxy put the following lines in tool_conf.xml for each +tool that you want:: + + + + + + + +The tools also require Perl and ``Bioperl`` to be installed. + +History +======= + +========== ===================================================================== += +Version Changes +---------- ---------------------------------------------------------------------- +- 1.0.1 Updates to use conda dependency resolution and tidy up XML +- 1.0.0 Initial version +========== ===================================================================== += + +Developers +========== + +This tool is developed on the following GitHub repository: +https://github.com/fls-bioinformatics-core/galaxy-tools/tree/master/tools/motif_tools + + +Licence (MIT) +============= + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff -r 000000000000 -r b67ea47730d3 Scan_IUPAC_output_each_match.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Scan_IUPAC_output_each_match.xml Wed Mar 21 06:39:41 2018 -0400 @@ -0,0 +1,96 @@ + + + Returns all matches to a given IUPAC in GFF format + + motif_tools_macros.xml + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +**What it does** + +This tool will find all matches to a DNA pattern in the input DNA sequence, represented by an IUPAC string. The matches are non-overlapping, so searching with 'TTTT' in 'TTTTTTTT' will find two hits to the IUPAC. The output is in GFF format and the last 'attribute' field can be specified using the 'Label' option. + +IUPAC = Nucleotide(s): + +A = A + +C = C + +G = G + +T = T + +M = A/C + +R = A/G + +W = A/T + +S = C/G + +Y = C/T + +K = G/T + +V = A/C/G + +H = A/C/T + +D = A/G/T + +B = C/G/T + +N = A/C/G/T + +---- + +.. class:: infomark + +**Options** + +'IUPAC string' - can be entered as upper- or lower-case as the tool will force them to become upper-case, but will only accept the IUPAC codes listed above. + +'Attribute in GFF output' - the last field of each GFF line 'attribute' can be specified using the 'Label' option, this should only include letters/numbers, but without spaces. + +'Select sequence strands to scan' - Only scanning the forward strand of the input sequence is useful if the IUPAC is a palindrome (e.g. CANNTG). + +---- + +.. class:: infomark + +**Credits** + +This Galaxy tool has been developed within the Bioinformatics Core Facility at the University of Manchester. It runs the Scan_IUPAC_output_each_match.pl Perl script that was written by Ian Donaldson. + +Please kindly acknowledge both this Galaxy tool and Scan_IUPAC_output_each_match.pl if you use it. 
+ + + + diff -r 000000000000 -r b67ea47730d3 Scan_IUPAC_output_matches_per_seq.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Scan_IUPAC_output_matches_per_seq.xml Wed Mar 21 06:39:41 2018 -0400 @@ -0,0 +1,92 @@ + + + Counts the matches to a given IUPAC + + motif_tools_macros.xml + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +**What it does** + +This tool will find all matches to a DNA pattern in the input DNA sequence, represented by an IUPAC string. The matches are non-overlapping, so searching with 'TTTT' in 'TTTTTTTT' will find two hits to the IUPAC. The output is a table that gives the seqname and the number of matches to the IUPAC per sequence. This version is useful if you want to get a count of IUPAC matches per sequence (e.g. a binding region) and paste the numbers back into a spreadsheet. + +IUPAC = Nucleotide(s): + +A = A + +C = C + +G = G + +T = T + +M = A/C + +R = A/G + +W = A/T + +S = C/G + +Y = C/T + +K = G/T + +V = A/C/G + +H = A/C/T + +D = A/G/T + +B = C/G/T + +N = A/C/G/T + +---- + +.. class:: infomark + +**Options** + +'IUPAC string' - can be entered as upper- or lower-case as the tool will force them to become upper-case, but will only accept the IUPAC codes listed above. + +'Select sequence strands to scan' - Only scanning the forward strand of the input sequence is useful if the IUPAC is a palindrome (e.g. CANNTG). + +---- + +.. class:: infomark + +**Credits** + +This Galaxy tool has been developed within the Bioinformatics Core Facility at the University of Manchester. It runs the Scan_IUPAC_output_matches_per_seq.pl Perl script that was written by Ian Donaldson. + +Please kindly acknowledge both this Galaxy tool and Scan_IUPAC_output_matches_per_seq.pl if you use it. 
+ + + + diff -r 000000000000 -r b67ea47730d3 TFBScluster_candidates_2TFBS.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TFBScluster_candidates_2TFBS.xml Wed Mar 21 06:39:41 2018 -0400 @@ -0,0 +1,116 @@ + + + Identifies clusters of two TFBS + + motif_tools_macros.xml + + + $output_log + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +**What it does** + +This tool takes two GFF files containing the positions of genomic features, typically transcription factor binding sites (TFBS) and looks for clusters with certain properties. The GFF file input could be different TFBS (e.g. combinatorial binding of different factors) or the same TFBS (clustering of multiple instances of the same factor). + +The cluster properties are explained in more detail in the **Options** section. + +---- + +.. class:: infomark + +**Options** + +'TFBS GFF files' - Each file contains genomic coordinates, typically matches between an IUPAC string representing a TFBS and a set of target sequences, such as those from a ChIP-seq experiment. However, the positions could be for any genomic feature over the whole genome. The important thing is that the different files have the same genome build in common. + +'Minimum occurrence of TFBS' - When clusters are determined you can ensure that a minimum number of occurrences from each TFBS are present. + +'Identifier for TFBS' - This allows information about the different TFBS sets to be propagated through to the output. The identifier could be the TFBS name or the IUPAC used to search for the sites, this should only include letters/numbers, but without spaces. + +'Minimum length of clusters' - The length is a window of sequence in which the specified number of TFBS must be located. Initially TFBScluster will identify all clusters matching the input criteria. It will then merge any overlapping clusters, which can result in lengths greater than the input length. 
+ +'Include or exclude overlapping TFBS' - You can choose to exclude any TFBS that overlaps with another when counting the number of co-occurring TFBS. By default such TFBS are excluded as a basic assumption about co-occurring/cooperative TFBS in a module is that both factors can bind at the same time, which they are unlikely to do if their binding sites overlap. + +---- + +.. class:: infomark + +**Credits** + +This Galaxy tool has been developed within the Bioinformatics Core Facility at the University of Manchester. It runs the TFBScluster_candidate.pl Perl script that was written by Ian Donaldson, which is a modification of the script from the original web tool. Articles below: + +http://www.ncbi.nlm.nih.gov/pubmed/15855248 + +http://www.ncbi.nlm.nih.gov/pubmed/16845063 + +Please kindly acknowledge both this Galaxy tool and TFBScluster articles if you use it. + + + + diff -r 000000000000 -r b67ea47730d3 TFBScluster_candidates_3TFBS.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TFBScluster_candidates_3TFBS.xml Wed Mar 21 06:39:41 2018 -0400 @@ -0,0 +1,125 @@ + + + Identifies clusters of three TFBS + + motif_tools_macros.xml + + + $output_log + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +**What it does** + +This tool takes three GFF files containing the positions of genomic features, typically transcription factor binding sites (TFBS) and looks for clusters with certain properties. The GFF file input could be different TFBS (e.g. combinatorial binding of different factors) or the same TFBS (clustering of multiple instances of the same factor). + +The cluster properties are explained in more detail in the **Options** section. + +---- + +.. class:: infomark + +**Options** + +'TFBS GFF files' - Each file contains genomic coordinates, typically matches between an IUPAC string representing a TFBS and a set of target sequences, such as those from a ChIP-seq experiment. 
However, the positions could be for any genomic feature over the whole genome. The important thing is that the different files have the same genome build in common. + +'Minimum occurrence of TFBS' - When clusters are determined you can ensure that a minimum number of occurrences from each TFBS are present. + +'Identifier for TFBS' - This allows information about the different TFBS sets to be propagated through to the output. The identifier could be the TFBS name or the IUPAC used to search for the sites, this should only include letters/numbers, but without spaces. + +'Minimum length of clusters' - The length is a window of sequence in which the specified number of TFBS must be located. Initially TFBScluster will identify all clusters matching the input criteria. It will then merge any overlapping clusters, which can result in lengths greater than the input length. + +'Include or exclude overlapping TFBS' - You can choose to exclude any TFBS that overlaps with another when counting the number of co-occurring TFBS. By default such TFBS are excluded as a basic assumption about co-occurring/cooperative TFBS in a module is that both factors can bind at the same time, which they are unlikely to do if their binding sites overlap. + +---- + +.. class:: infomark + +**Credits** + +This Galaxy tool has been developed within the Bioinformatics Core Facility at the University of Manchester. It runs the TFBScluster_candidate.pl Perl script that was written by Ian Donaldson, which is a modification of the script from the original web tool. Articles below: + +http://www.ncbi.nlm.nih.gov/pubmed/15855248 + +http://www.ncbi.nlm.nih.gov/pubmed/16845063 + +Please kindly acknowledge both this Galaxy tool and TFBScluster articles if you use it. 
+ + + + diff -r 000000000000 -r b67ea47730d3 motif_tools_macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/motif_tools_macros.xml Wed Mar 21 06:39:41 2018 -0400 @@ -0,0 +1,9 @@ + + 1.0.1 + + + perl-bioperl + + + diff -r 000000000000 -r b67ea47730d3 test-data/iupac_each_match.gff --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/iupac_each_match.gff Wed Mar 21 06:39:41 2018 -0400 @@ -0,0 +1,14 @@ +gi|9626372|ref|NC_001422.1| RegexSearch CNS 70 75 . + . IUPAC_or_name +gi|9626372|ref|NC_001422.1| RegexSearch CNS 2282 2287 . + . IUPAC_or_name +gi|9626372|ref|NC_001422.1| RegexSearch CNS 2439 2444 . + . IUPAC_or_name +gi|9626372|ref|NC_001422.1| RegexSearch CNS 2526 2531 . + . IUPAC_or_name +gi|9626372|ref|NC_001422.1| RegexSearch CNS 3218 3223 . + . IUPAC_or_name +gi|9626372|ref|NC_001422.1| RegexSearch CNS 3263 3268 . + . IUPAC_or_name +gi|9626372|ref|NC_001422.1| RegexSearch CNS 3833 3838 . + . IUPAC_or_name +gi|9626372|ref|NC_001422.1| RegexSearch CNS 4622 4627 . + . IUPAC_or_name +gi|9626372|ref|NC_001422.1| RegexSearch CNS 64 69 . - . IUPAC_or_name +gi|9626372|ref|NC_001422.1| RegexSearch CNS 346 351 . - . IUPAC_or_name +gi|9626372|ref|NC_001422.1| RegexSearch CNS 2086 2091 . - . IUPAC_or_name +gi|9626372|ref|NC_001422.1| RegexSearch CNS 3287 3292 . - . IUPAC_or_name +gi|9626372|ref|NC_001422.1| RegexSearch CNS 3530 3535 . - . IUPAC_or_name +gi|9626372|ref|NC_001422.1| RegexSearch CNS 4718 4723 . - . 
IUPAC_or_name diff -r 000000000000 -r b67ea47730d3 test-data/iupac_matches_per_seq.out --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/iupac_matches_per_seq.out Wed Mar 21 06:39:41 2018 -0400 @@ -0,0 +1,1 @@ +gi|9626372|ref|NC_001422.1| 14 diff -r 000000000000 -r b67ea47730d3 test-data/phix.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/phix.fa Wed Mar 21 06:39:41 2018 -0400 @@ -0,0 +1,78 @@ +>gi|9626372|ref|NC_001422.1| Enterobacteria phage phiX174, complete genome +GAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTT +GATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAA +ATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTG +TCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTA +GATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATC +TGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTT +TCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGATTT +CGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGTTGAGGCT +TGCGTTTATGGTACGCTGGACTTTGTGGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCG +TCATTGCTTATTATGTTCATCCCGTCAACATTCAAACGGCCTGTCTCATCATGGAAGGCGCTGAATTTAC +GGAAAACATTATTAATGGCGTCGAGCGTCCGGTTAAAGCCGCTGAATTGTTCGCGTTTACCTTGCGTGTA +CGCGCAGGAAACACTGACGTTCTTACTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCGGAAGGAG +TGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACT +AAAGGCAAGCGTAAAGGCGCTCGTCTTTGGTATGTAGGTGGTCAACAATTTTAATTGCAGGGGCTTCGGC +CCCTTACTTGAGGATAAATTATGTCTAATATTCAAACTGGCGCCGAGCGTATGCCGCATGACCTTTCCCA +TCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGAC +TCCTTCGAGATGGACGCCGTTGGCGCTCTCCGTCTTTCTCCATTGCGTCGTGGCCTTGCTATTGACTCTA +CTGTAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAA +GGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTT +GGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCAGGGTTATTTGAATATCTATAACA 
+ACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGC +TCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGACTGCTCCGCTTCCTCCTGAGACTGAGCTT +TCTCGCCAAATGACGACTTCTACCACATCTATTGACATTATGGGTCTGCAAGCTGCTTATGCTAATTTGC +ATACTGACCAAGAACGTGATTACTTCATGCAGCGTTACCATGATGTTATTTCTTCATTTGGAGGTAAAAC +CTCTTATGACGCTGACAACCGTCCTTTACTTGTCATGCGCTCTAATCTCTGGGCATCTGGCTATGATGTT +GATGGAACTGACCAAACGTCGTTAGGCCAGTTTTCTGGTCGTGTTCAACAGACCTATAAACATTCTGTGC +CGCGTTTCTTTGTTCCTGAGCATGGCACTATGTTTACTCTTGCGCTTGTTCGTTTTCCGCCTACTGCGAC +TAAAGAGATTCAGTACCTTAACGCTAAAGGTGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTTG +TATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGT +TTAAGATTGCTGAGGGTCAGTGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACCTTCTTGA +AGGCTTCCCATTCATTCAGGAACCGCCTTCTGGTGATTTGCAAGAACGCGTACTTATTCGCCACCATGAT +TATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGCAGTGGAATAGTCAGGTTAAATTTAATGTGACCGTTT +ATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGAGTGTGAGGTTATAAC +GCCGAAGCGGTAAAAATTTTAATTTTTGCCGCTGAGGGGTTGACCAAGCGAAGCGCGGTAGGTTTTCTGC +TTAGGAGTTTAATCATGTTTCAGACTTTTATTTCTCGCCATAATTCAAACTTTTTTTCTGATAAGCTGGT +TCTCACTTCTGTTACTCCAGCTTCTTCGGCACCTGTTTTACAGACACCTAAAGCTACATCGTCAACGTTA +TATTTTGATAGTTTGACGGTTAATGCTGGTAATGGTGGTTTTCTTCATTGCATTCAGATGGATACATCTG +TCAACGCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGC +CTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTG +AATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGTGTGACTATTGACGTCCTTCCCCGTACGC +CGGGCAATAACGTTTATGTTGGTTTCATGGTTTGGTCTAACTTTACCGCTACTAAATGCCGCGGATTGGT +TTCGCTGAATCAGGTTATTAAAGAGATTATTTGTCTCCAGCCACTTAAGTGAGGTGATTTATGTTTGGTG +CTATTGCTGGCGGTATTGCTTCTGCTCTTGCTGGTGGCGCCATGTCTAAATTGTTTGGAGGCGGTCAAAA +AGCCGCCTCCGGTGGCATTCAAGGTGATGTGCTTGCTACCGATAACAATACTGTAGGCATGGGTGATGCT +GGTATTAAATCTGCCATTCAAGGCTCTAATGTTCCTAACCCTGATGAGGCCGCCCCTAGTTTTGTTTCTG +GTGCTATGGCTAAAGCTGGTAAAGGACTTCTTGAAGGTACGTTGCAGGCTGGCACTTCTGCCGTTTCTGA +TAAGTTGCTTGATTTGGTTGGACTTGGTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTAT 
+CTTGCTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGG +TTGACGCCGGATTTGAGAATCAAAAAGAGCTTACTAAAATGCAACTGGACAATCAGAAAGAGATTGCCGA +GATGCAAAATGAGACTCAAAAAGAGATTGCTGGCATTCAGTCGGCGACTTCACGCCAGAATACGAAAGAC +CAGGTATATGCACAAAATGAGATGCTTGCTTATCAACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTA +TGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGATTATGCGCCAAATGCTTACTCAAGCTCA +AACGGCTGGTCAGTATTTTACCAATGACCAAATCAAAGAAATGACTCGCAAGGTTAGTGCTGAGGTTGAC +TTAGTTCATCAGCAAACGCAGAATCAGCGGTATGGCTCTTCTCATATTGGCGCTACTGCAAAGGATATTT +CTAATGTCGTCACTGATGCTGCTTCTGGTGTGGTTGATATTTTTCATGGTATTGATAAAGCTGTTGCCGA +TACTTGGAACAATTTCTGGAAAGACGGTAAAGCTGATGGTATTGGCTCTAATTTGTCTAGGAAATAACCG +TCAGGATTGACACCCTCCCAATTGTATGTTTTCATGCCTCCAAATCTTGGAGGCTTTTTTATGGTTCGTT +CTTATTACCCTTCTGAATGTCACGCTGATTATTTTGACTTTGAGCGTATCGAGGCTCTTAAACCTGCTAT +TGAGGCTTGTGGCATTTCTACTCTTTCTCAATCCCCAATGCTTGGCTTCCATAAGCAGATGGATAACCGC +ATCAAGCTCTTGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATG +TTGACGGCCATAAGGCTGCTTCTGACGTTCGTGATGAGTTTGTATCTGTTACTGAGAAGTTAATGGATGA +ATTGGCACAATGCTACAATGTGCTCCCCCAACTTGATATTAATAACACTATAGACCACCGCCCCGAAGGG +GACGAAAAATGGTTTTTAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGCTGAACGCC +CTCTTAAGGATATTCGCGATGAGTATAATTACCCCAAAAAGAAAGGTATTAAGGATGAGTGTTCAAGATT +GCTGGAGGCCTCCACTATGAAATCGCGTAGAGGCTTTGCTATTCAGCGTTTGATGAATGCAATGCGACAG +GCTCATGCTGATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGACCGATTAGAGGCGTTTT +ATGATAATCCCAATGCTTTGCGTGACTATTTTCGTGATATTGGTCGTATGGTTCTTGCTGCCGAGGGTCG +CAAGGCTAATGATTCACACGCCGACTGCTATCAGTATTTTTGTGTGCCTGAGTATGGTACAGCTAATGGC +CGTCTTCATTTCCATGCGGTGCACTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTC +GTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCAT +CGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAG +CCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAAAGTCAGATA +TGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACT +TCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTG 
+TCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGC +AGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACC +TGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCA