Next changeset 1:8b26adf64adc (2013-11-05) |
Commit message:
Initial upload |
added:
._BSseeker2 BSseeker2/._AUTHORS BSseeker2/._FilterReads.py BSseeker2/._README.md BSseeker2/._RELEASE_NOTES BSseeker2/._bs_align BSseeker2/._bs_index BSseeker2/._bs_seeker2-align.py BSseeker2/._bs_seeker2-build.py BSseeker2/._bs_seeker2-call_methylation.py BSseeker2/._bs_utils BSseeker2/._galaxy BSseeker2/AUTHORS BSseeker2/FilterReads.py BSseeker2/LICENSE BSseeker2/README.md BSseeker2/RELEASE_NOTES BSseeker2/bs_align/.___init__.py BSseeker2/bs_align/._bs_align_utils.py BSseeker2/bs_align/._bs_pair_end.py BSseeker2/bs_align/._bs_rrbs.py BSseeker2/bs_align/._bs_single_end.py BSseeker2/bs_align/._output.py BSseeker2/bs_align/__init__.py BSseeker2/bs_align/bs_align_utils.py BSseeker2/bs_align/bs_pair_end.py BSseeker2/bs_align/bs_rrbs.py BSseeker2/bs_align/bs_single_end.py BSseeker2/bs_align/output.py BSseeker2/bs_index/.___init__.py BSseeker2/bs_index/._rrbs_build.py BSseeker2/bs_index/._wg_build.py BSseeker2/bs_index/__init__.py BSseeker2/bs_index/rrbs_build.py BSseeker2/bs_index/wg_build.py BSseeker2/bs_seeker2-align.py BSseeker2/bs_seeker2-build.py BSseeker2/bs_seeker2-call_methylation.py BSseeker2/bs_utils/.___init__.py BSseeker2/bs_utils/._utils.py BSseeker2/bs_utils/__init__.py BSseeker2/bs_utils/utils.py BSseeker2/galaxy/.___init__.py BSseeker2/galaxy/._bs_seeker2_wrapper.py BSseeker2/galaxy/._bs_seeker2_wrapper.xml BSseeker2/galaxy/__init__.py BSseeker2/galaxy/bs_seeker2_wrapper.py BSseeker2/galaxy/bs_seeker2_wrapper.xml |
b |
diff -r 000000000000 -r e6df770c0e58 ._BSseeker2 |
b |
Binary file ._BSseeker2 has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/._AUTHORS |
b |
Binary file BSseeker2/._AUTHORS has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/._FilterReads.py |
b |
Binary file BSseeker2/._FilterReads.py has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/._README.md |
b |
Binary file BSseeker2/._README.md has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/._RELEASE_NOTES |
b |
Binary file BSseeker2/._RELEASE_NOTES has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/._bs_align |
b |
Binary file BSseeker2/._bs_align has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/._bs_index |
b |
Binary file BSseeker2/._bs_index has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/._bs_seeker2-align.py |
b |
Binary file BSseeker2/._bs_seeker2-align.py has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/._bs_seeker2-build.py |
b |
Binary file BSseeker2/._bs_seeker2-build.py has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/._bs_seeker2-call_methylation.py |
b |
Binary file BSseeker2/._bs_seeker2-call_methylation.py has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/._bs_utils |
b |
Binary file BSseeker2/._bs_utils has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/._galaxy |
b |
Binary file BSseeker2/._galaxy has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/AUTHORS --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/AUTHORS Fri Jul 12 18:47:28 2013 -0400 |
b |
@@ -0,0 +1,7 @@ +BS-Seeker2 authors + +Pao-Yang Chen <paoyang@gate.sinica.edu.tw> +Petko Fiziev <peplafi@gmail.com> +Weilong Guo <guoweilong@gmail.com> + +May 2013 |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/FilterReads.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/FilterReads.py Fri Jul 12 18:47:28 2013 -0400 |
[ |
@@ -0,0 +1,237 @@ +#!/usr/bin/python + +import sys +import os +import os.path +import re + +# Define the Dictionary & the Reverse Dictionary for barcode + +# =========================================== +# +def FilterFastq (infile, outfile) : + try: + INFILE = open(infile, 'r') + except IOError: + print "\n[Error]:\n\t File cannot be open: ", infile; + exit(-1) + try: + OUTFILE = open(outfile, 'w') + except IOError: + print "\n[Error]:\n\t File cannot be open: ", outfile; + exit(-1) + Dict = {} + count = 0 + total = 0 + while 1 : + line1 = INFILE.readline() + if not line1 : + break + line1 = line1 + line2 = INFILE.readline() + line3 = INFILE.readline() + line4 = INFILE.readline() + total += 1 + # print line2 + if ( line2 in Dict ) : + Dict[line2] += 1 + else : + Dict[line2] = 1 + OUTFILE.write(line1) + OUTFILE.write(line2) + OUTFILE.write(line3) + OUTFILE.write(line4) + count += 1 + INFILE.close() + OUTFILE.close() + print "Count of total raw reads: ", total + print "Count of reads left: ", count + print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" + + +# =============================== +# + +def FilterSequence (infile, outfile) : + try: + INFILE = open(infile, 'r') + except IOError: + print "\n[Error]:\n\t File cannot be open: ", infile; + exit(-1) + try: + OUTFILE = open(outfile, 'w') + except IOError: + print "\n[Error]:\n\t File cannot be open: ", outfile; + exit(-1) + Dict = {} + count = 0 + total = 0 + for line in INFILE: + line = line.strip() + total += 1 + if ( line in Dict ) : + Dict[line] += 1 + else : + Dict[line] = 1 + count += 1 + OUTFILE.write(line + "\n") + INFILE.close() + OUTFILE.close() + print "Count of total raw reads: ", total + print "Count of reads left: ", count + print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" + +# =============================== +# SN603 WA047 6 1101 41.40 99.10 0 1 .GGGA.......TTAG.............. 
@SZ__@@@@@@@RR__@@@@@@@@@@@@@@ 0 +# +def FilterQseq (infile, outfile, keepquality) : + if keepquality : + print "User specified '-k' and read quality will not be considered" + else : + print "Reads with PF=0 will be filtered" + + try: + INFILE = open(infile, 'r') + except IOError: + print "\n[Error]:\n\t File cannot be open: ", infile; + exit(-1) + try: + OUTFILE = open(outfile, 'w') + except IOError: + print "\n[Error]:\n\t File cannot be open: ", outfile; + exit(-1) + Dict = {} + count = 0 + total = 0 + for line in INFILE: + tokens = line.strip().split() + total += 1 + if ( (not keepquality and tokens[10] == "1") or keepquality ) : + if ( tokens[8] in Dict ) : + Dict[tokens[8]] += 1 + else : + Dict[tokens[8]] = 1 + count += 1 + OUTFILE.write(line) + + INFILE.close() + OUTFILE.close() + print "Count of total raw reads: ", total + print "Count of reads left: ", count + print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" + + +# =============================== +# + +def FilterFasta (infile, outfile) : + try: + INFILE = open(infile, 'r') + except IOError: + print "\n[Error]:\n\t File cannot be open: ", infile; + exit(-1) + try: + OUTFILE = open(outfile, 'w') + except IOError: + print "\n[Error]:\n\t File cannot be open: ", outfile; + exit(-1) + Dict = {} + count = 0 + total = 0 + name = "" + read = "" + for line in INFILE: + line = line.strip() + if (line[0] == '>') : + total += 1 + if ( name != "" ) : + if ( read in Dict ) : + Dict[read] += 1 + else : + Dict[read] = 1 + count += 1 + OUTFILE.write(">" + name + "\n") + OUTFILE.write(read + "\n") + name = line[1:] + read = "" + else : + read = read + line + if (name != "") : + if ( read in Dict ) : + Dict[read] += 1 + else : + Dict[read] = 1 + count += 1 + OUTFILE.write(">" + name + "\n") + OUTFILE.write(read + "\n") + print "Count of total raw reads: ", total + print "Count of reads left: ", count + print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" + + + +# 
============================== +# Decide branch functions for different format + +def FilterReads (infile, outfile, keepquality): + try: + INFILE = open(infile, 'r') + except IOError: + print "\n[Error]:\n\t File cannot be open: ", infile; + exit(-1) + i = 0 + # Count the numbers of barcodes in first 10000 lines + line = INFILE.readline() + tokens = line.strip().split() + input_format=""; + if line[0]=="@" : + input_format = "FastQ" + n_fastq = 0 + elif len(tokens) == 1 and line[0] != ">": + input_format = "sequence" + elif len(tokens) == 11: + input_format = "qseq" + elif line[0]==">" : + input_format = "fasta" + INFILE.close() + + print "Input file format detected: ", input_format + + if input_format == "FastQ" : + FilterFastq(infile, outfile) + elif input_format == "sequence" : + FilterSequence(infile, outfile) + elif input_format == "qseq" : + FilterQseq(infile, outfile, keepquality) + elif input_format == "fasta" : + FilterFasta(infile, outfile) + + +from optparse import OptionParser + +# =========================================== +def main(): + usage = "Usage: %prog -i <input> -o <output> [-k]\n" \ + "Author : Guo, Weilong; guoweilong@gmail.com; 2012-11-10\n" \ + "Last Update: 2013-04-01" \ + "Description: Unique reads for qseq/fastq/fasta/sequencce,\n" \ + " and filter low quality reads in qseq file." 
+ parser = OptionParser(usage) + parser.add_option("-i", dest="infile", + help="Name of the input qseq/fastq/fasta/sequence file", metavar="FILE") + parser.add_option("-o", dest="outfile", + help="Name of the output file", metavar="FILE") + parser.add_option("-k", dest="keepquality", default = False, action = "store_true", + help="Would not filter low quality reads if specified") + (options, args) = parser.parse_args() + + if (options.infile is None) or (options.outfile is None) : + print parser.print_help() + exit(-1) + FilterReads(options.infile, options.outfile, options.keepquality) + + +# =========================================== +if __name__ == "__main__": + main() + |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/LICENSE --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/LICENSE Fri Jul 12 18:47:28 2013 -0400 |
b |
@@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2012-2013 BS-Seeker2 Developers. All Rights Reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/README.md --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/README.md Fri Jul 12 18:47:28 2013 -0400 |
[ |
b'@@ -0,0 +1,590 @@\n+BS-Seeker2\n+=========\n+\n+BS-Seeker2 (BS Seeker 2) performs accurate and fast mapping of bisulfite-treated short reads. BS-Seeker2 is an updated version on BS-Seeker.\n+\n+0. Availability\n+============\n+\n+Homepage of [BS-Seeker2](http://pellegrini.mcdb.ucla.edu/BS_Seeker2/).\n+\n+The source code for this package is available from\n+[https://github.com/BSSeeker/BSseeker2](https://github.com/BSSeeker/BSseeker2).\n+Also, you can use an instance of BS-Seeker 2 in Galaxy from [http://galaxy.hoffman2.idre.ucla.edu](http://galaxy.hoffman2.idre.ucla.edu).\n+(Label: "NGS: Methylation Mapping"/"Methylation Map with BS Seeker2")\n+\n+\n+1. Remarkable new features\n+============\n+* Reduced index for RRBS, accelerating the mapping speed and increasing mappability\n+* Allowing local alignment with Bowtie 2, increased the mappability\n+\n+2. Other features\n+============\n+* Supported library types\n+\t- whole genome-wide bisulfite sequencing (WGBS)\n+\t- reduced representative bisulfite sequencing (RRBS)\n+\n+* Supported formats for input file\n+\t- [fasta](http://en.wikipedia.org/wiki/FASTA_format)\n+\t- [fastq](http://en.wikipedia.org/wiki/FASTQ_format)\n+\t- [qseq](http://jumpgate.caltech.edu/wiki/QSeq)\n+\t- pure sequence (one-line one-sequence)\n+\n+* Supported alignment tools\n+\t- [bowtie](http://bowtie-bio.sourceforge.net/index.shtml) : Single-seed\n+\t- [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) : Multiple-seed, gapped-alignment\n+\t\t- [local alignment](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#local-alignment-example)\n+\t\t- [end-to-end alignment](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#end-to-end-alignment-example)\n+\t- [SOAP](http://soap.genomics.org.cn/)\n+\n+* Supported formats for mapping results\n+\t- [BAM](http://genome.ucsc.edu/FAQ/FAQformat.html#format5.1)\n+\t- [SAM](http://samtools.sourceforge.net/)\n+\t- [BS-seeker 
1](http://pellegrini.mcdb.ucla.edu/BS_Seeker/USAGE.html)\n+\n+3. System requirements\n+============\n+\n+* Linux or Mac OS platform\n+* One of the following Aligner\n+ - [bowtie](http://bowtie-bio.sourceforge.net/)\n+ - [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/) (Recommend)\n+ - [soap](http://soap.genomics.org.cn/)\n+* [Python](http://www.python.org/download/) (Version 2.6 +)\n+\n+ (It is normally pre-installed in Linux. Type " python -V" to see the installed version.)\n+\n+* [pysam](http://code.google.com/p/pysam/) package is needed.\n+\n+ (Read "Questions & Answers" if you have problem when installing this package.)\n+\n+\n+\n+4. Modules\' descriptions\n+============\n+\n+(0) FilterReads.py\n+------------\n+\n+Optional and independent module.\n+Some reads would be extremely amplified during the PCR. This script helps you get unique reads before doing the mapping. You can decide whether or not to filter reads before doing the mapping.\n+\n+##Usage :\n+\n+\t$ python FilterReads.py\n+\tUsage: FilterReads.py -i <input> -o <output> [-k]\n+\tAuthor : Guo, Weilong; guoweilong@gmail.com; 2012-11-10\n+\tUnique reads for qseq/fastq/fasta/sequencce, and filter\n+\tlow quality file in qseq file.\n+\n+\tOptions:\n+\t -h, --help show this help message and exit\n+\t -i FILE Name of the input qseq/fastq/fasta/sequence file\n+\t -o FILE Name of the output file\n+\t -k Would not filter low quality reads if specified\n+\n+\n+##Tip :\n+\n+- This step is not suggested for RRBS library, as reads from RRBS library would more likely from the same location.\n+\n+\n+(1) bs_seeker2-build.py\n+------------\n+\n+Module to build the index for BS-Seeker2.\n+\n+\n+##Usage :\n+\n+ $ python bs_seeker2-build.py -h\n+ Usage: bs_seeker2-build.py [options]\n+\n+ Options:\n+ -h, --help show this help message and exit\n+ -f FILE, --file=FILE Input your reference genome file (fasta)\n+ --aligner=ALIGNER Aligner program to perform the analysis: bowtie,\n+ bowtie2, soap [Default: bowtie2]\n+ -p 
PATH, --path=PATH Path to the aligner program. Defaults:\n+ bowtie: /u/home'..b't C/T at this position.\n+\n+\n+\n+Contact Information\n+============\n+\n+If you still have questions on BS-Seeker 2, or you find bugs when using BS-Seeker 2, or you have suggestions, please write email to guoweilong@gmail.com (Weilong Guo).\n+\n+\n+\n+Questions & Answers\n+============\n+\n+(1) Speed-up your alignment\n+\n+Q: "It takes me days to do the alignment for one lane" ...\n+\n+A: Yes, alignment is a time-consuming work, especially because the sequencing depth is increasing. An efficient way to align is :\n+\n+ i. cut the original sequence file into multiple small pieces;\n+\n+ Ex: split -l 4000000 input.fq\n+\n+ ii. align them in parallel;\n+ iii. merge all the BAM files into a single one before running "bs-seeker2_call-methylation.py" (user "samtools merge" command).\n+\n+ Ex: samtools merge out.bam in1.bam in2.bam in3.bam\n+\n+(2) read in BAM/SAM\n+\n+Q: Is the read sequence in BAM/SAM file is the same as my original one?\n+\n+A: NO. They are different for several reasons.\n+\n+ i. For RRBS, some reads are short because of trimming of the adapters\n+ ii. For read mapping on Crick (-) strand, the reads are in fact the antisense version of the original sequence, opposite both in nucleotides and direction\n+\n+(3) "Pysam" package related problem\n+\n+Q: I\'m normal account user for Linux(Cluster). I can\'t install "pysam". I get following error massages:\n+\n+\n+ $ python setup.py install\n+ running install\n+ error: can\'t create or remove files in install directory\n+ The following error occurred while trying to add or remove files in the\n+ installation directory:\n+ [Errno 13] Permission denied: \'/usr/lib64/python2.6/site-packages/test-easy-install-26802.write-test\'\n+ ...\n+\n+\n+A: You can ask the administrator of your cluster to install pysam. If you don\'t want to bother him/her, you might need to build your own python, and then install the "pysam" package. 
The following script could be helpful for you.\n+\n+\n+ mkdir ~/install\n+ cd ~/install/\n+\n+ # install python\n+ wget http://www.python.org/ftp/python/2.7.4/Python-2.7.4.tgz # download the python from websites\n+ tar zxvf Python-2.7.4.tgz # decompress\n+ cd Python-2.7.4\n+ ./configure --prefix=`pwd`\n+ make\n+ make install\n+\n+ # Add the path of Python to $PATH\n+ # Please add the following line to file ~/.bashrc\n+\n+ export PATH=~/install/Python-2.7.4:$PATH\n+\n+ # save the ~/.bashrc file\n+ source ~/.bashrc\n+\n+ # install pysam package\n+ wget https://pysam.googlecode.com/files/pysam-0.7.4.tar.gz\n+ tar zxvf pysam-0.7.4.tar.gz\n+ cd pysam-0.7.4\n+ python setup.py build\n+ python setup.py install\n+ # re-login the shell after finish installing pysam\n+\n+ # install BS-Seeker2\n+ wget https://github.com/BSSeeker/BSseeker2/archive/master.zip\n+ mv master BSSeeker2.zip\n+ unzip BSSeeker2.zip\n+ cd BSseeker2-master/\n+\n+\n+\xef\xbc\x884\xef\xbc\x89Run BS-Seeker2\n+\n+Q: Can I add the path of BS-Seeker2\'s *.py to the $PATH, so I can call\n+BS-Seeker2 from anywhere?\n+\n+A: If you\'re using the "python" from path "/usr/bin/python", you can directly\n+add the path of BS-Seeker2 in file "~/.bash_profile" (bash) or "~/.profile"\n+(other shell) or "~/.bashrc" (per-interactive-shell startup).\n+But if you are using python under other directories, you might need to modify\n+BS-Seeker2\'s script first. For example, if your python path is "/my_python/python",\n+please change the first line in "bs_seeker-build.py", "bs_seeker-align.py" and\n+"bs_seeker-call_methylation.py" to\n+\n+ #!/my_python/python\n+\n+Then add\n+\n+ export PATH=/path/to/BS-Seeker2/:$PATH\n+\n+to file "~/.bash_profile" (e.g.), and source the file:\n+\n+ source ~/.bash_profile\n+\n+Then you can use BS-Seeker2 globally by typing:\n+\n+ bs_seeker_build.py -h\n+ bs_seeker-align.py -h\n+ bs_seeker-call_methylation.py -h\n+\n+\n+\n+\n' |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/RELEASE_NOTES --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/RELEASE_NOTES Fri Jul 12 18:47:28 2013 -0400 |
b |
@@ -0,0 +1,7 @@ + +v2.0.3 - May 19, 2013 +1. Fix bug in utils.py +2. Add UPDATE file + + + |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_align/.___init__.py |
b |
Binary file BSseeker2/bs_align/.___init__.py has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_align/._bs_align_utils.py |
b |
Binary file BSseeker2/bs_align/._bs_align_utils.py has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_align/._bs_pair_end.py |
b |
Binary file BSseeker2/bs_align/._bs_pair_end.py has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_align/._bs_rrbs.py |
b |
Binary file BSseeker2/bs_align/._bs_rrbs.py has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_align/._bs_single_end.py |
b |
Binary file BSseeker2/bs_align/._bs_single_end.py has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_align/._output.py |
b |
Binary file BSseeker2/bs_align/._output.py has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_align/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/bs_align/__init__.py Fri Jul 12 18:47:28 2013 -0400 |
b |
@@ -0,0 +1,1 @@ +__author__ = 'pf' |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_align/bs_align_utils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/bs_align/bs_align_utils.py Fri Jul 12 18:47:28 2013 -0400 |
[ |
b'@@ -0,0 +1,385 @@\n+from bs_utils.utils import *\n+import re\n+\n+\n+BAM_MATCH = 0\n+BAM_INS = 1\n+BAM_DEL = 2\n+BAM_SOFTCLIP = 4\n+\n+CIGAR_OPS = {\'M\' : BAM_MATCH, \'I\' : BAM_INS, \'D\' : BAM_DEL, \'S\' : BAM_SOFTCLIP}\n+\n+\n+def N_MIS(r,g):\n+ mismatches = 0\n+ if len(r)==len(g):\n+ for i in xrange(len(r)):\n+ if r[i] != g[i] and r[i] != "N" and g[i] != "N" and not(r[i] == \'T\' and g[i] == \'C\'):\n+ mismatches += 1\n+ return mismatches\n+\n+\n+#----------------------------------------------------------------\n+\n+"""\n+Exmaple:\n+========\n+Read : ACCGCGTTGATCGAGTACGTACGTGGGTC\n+Adapter : ....................ACGTGGGTCCCG\n+========\n+\n+no_mismatch : the maximum number allowed for mismatches\n+\n+Algorithm: (allowing 1 mismatch)\n+========\n+-Step 1: \n+ ACCGCGTTGATCGAGTACGTACGTGGGTC\n+ ||XX\n+ ACGTGGGTCCCG\n+-Step 2: \n+ ACCGCGTTGATCGAGTACGTACGTGGGTC\n+ X||X\n+ .ACGTGGGTCCCG\n+-Step 3: \n+ ACCGCGTTGATCGAGTACGTACGTGGGTC\n+ XX\n+ ..ACGTGGGTCCCG\n+-Step ...\n+-Step N: \n+ ACCGCGTTGATCGAGTACGTACGTGGGTC\n+ |||||||||\n+ ....................ACGTGGGTCCCG\n+Success & return!\n+========\n+\n+"""\n+\n+def RemoveAdapter ( read, adapter, no_mismatch ) :\n+ lr = len(read)\n+ la = len(adapter)\n+ for i in xrange( lr - no_mismatch ) :\n+ read_pos = i\n+ adapter_pos = 0\n+ count_no_mis = 0\n+ while (adapter_pos < la) and (read_pos < lr) :\n+ if (read[read_pos] == adapter[adapter_pos]) :\n+ read_pos = read_pos + 1\n+ adapter_pos = adapter_pos + 1\n+ else :\n+ count_no_mis = count_no_mis + 1\n+ if count_no_mis > no_mismatch :\n+ break\n+ else :\n+ read_pos = read_pos + 1\n+ adapter_pos = adapter_pos + 1\n+ # while_end\n+\n+ if adapter_pos == la or read_pos == lr :\n+ return read[:i]\n+ # for_end\n+ return read\n+\n+\n+def Remove_5end_Adapter ( read, adapter, no_mismatch) :\n+ lr = len(read)\n+ la = len(adapter)\n+ for i in xrange (la - no_mismatch) :\n+ read_pos = 0\n+ adapter_pos = i\n+ count_no_mis = 0\n+ while (adapter_pos < la) and (read_pos < lr) :\n+ if 
(read[read_pos] == adapter[adapter_pos]) :\n+ adapter_pos = adapter_pos + 1\n+ read_pos = read_pos + 1\n+ else :\n+ count_no_mis = count_no_mis + 1\n+ if count_no_mis > no_mismatch :\n+ break\n+ else : \n+ read_pos = read_pos + 1\n+ adapter_pos = adapter_pos + 1\n+ # while_end\n+ if adapter_pos == la :\n+ return read[(la-i):]\n+\n+\n+def next_nuc(seq, pos, n):\n+ """ Returns the nucleotide that is n places from pos in seq. Skips gap symbols.\n+ """\n+ i = pos + 1\n+ while i < len(seq):\n+ if seq[i] != \'-\':\n+ n -= 1\n+ if n == 0: break\n+ i += 1\n+ if i < len(seq) :\n+ return seq[i]\n+ else :\n+ return \'N\'\n+\n+\n+\n+def methy_seq(read, genome):\n+ H = [\'A\', \'C\', \'T\']\n+ m_seq = []\n+ xx = "-"\n+ for i in xrange(len(read)):\n+\n+ if genome[i] == \'-\':\n+ continue\n+\n+ elif read[i] != \'C\' and read[i] != \'T\':\n+ xx = "-"\n+\n+ elif read[i] == "T" and genome[i] == "C": #(unmethylated):\n+ nn1 = next_nuc(genome, i, 1)\n+ if nn1 == "G":\n+ xx = "x"\n+ elif nn1 in H :\n+ nn2 = next_nuc(genome, i, 2)\n+ if nn2 == "G":\n+ xx = "y"\n+ elif nn2 in H :\n+ xx = "z"\n+\n+ elif read[i] == "C" and genome[i] == "C": #(methylated):\n+ nn1 = next_nuc(genome, i, 1)\n+\n+ if nn1 == "G":\n+ xx = "X"\n+ elif nn1 in H :\n+ nn2 = next_nuc(genome, i, 2)\n+\n+ if nn2 == "G":\n+ xx = "Y"\n+ '..b' header1, chr1, location1, no_mismatch1, mate1, strand1, cigar1 = parse_SOAP(line)\n+ header2, _ , location2, no_mismatch2, _, strand2, cigar2 = parse_SOAP(input.next())\n+\n+ if mate1 == \'b\':\n+ location1, location2 = location2, location1\n+ strand1, strand2 = strand2, strand1\n+ ciga1, cigar2 = cigar2, cigar1\n+\n+\n+ if header1 and header2 and strand1 == \'+\' and strand2 == \'-\':\n+ yield header1, chr1, no_mismatch1 + no_mismatch2, location1, cigar1, location2, cigar2\n+\n+ else:\n+ for line in input:\n+ header, chr, location, no_mismatch, _, strand, cigar = parse_SOAP(line)\n+ if header and strand == \'+\':\n+ yield header, chr, location, no_mismatch, cigar\n+ elif format == 
RMAP :\n+ if pair_end :\n+ todo = 0\n+ # to do\n+ else :\n+ for line in input:\n+ header, chr, location, read_len, no_mismatch, strand = parse_RMAP(line)\n+ cigar = str(read_len) + "M"\n+ yield header, chr, location, no_mismatch, cigar\n+\n+ input.close()\n+\n+\n+def parse_cigar(cigar_string):\n+ i = 0\n+ prev_i = 0\n+ cigar = []\n+ while i < len(cigar_string):\n+ if cigar_string[i] in CIGAR_OPS:\n+ cigar.append((CIGAR_OPS[cigar_string[i]], int(cigar_string[prev_i:i])))\n+ prev_i = i + 1\n+ i += 1\n+ return cigar\n+\n+def get_read_start_end_and_genome_length(cigar):\n+ r_start = cigar[0][1] if cigar[0][0] == BAM_SOFTCLIP else 0\n+ r_end = r_start\n+ g_len = 0\n+ for edit_op, count in cigar:\n+ if edit_op == BAM_MATCH:\n+ r_end += count\n+ g_len += count\n+ elif edit_op == BAM_INS:\n+ r_end += count\n+ elif edit_op == BAM_DEL:\n+ g_len += count\n+ return r_start, r_end, g_len # return the start and end in the read and the length of the genomic sequence\n+ # r_start : start position on the read\n+ # r_end : end position on the read\n+ # g_len : length of the mapped region on genome\n+\n+\n+def cigar_to_alignment(cigar, read_seq, genome_seq):\n+ """ Reconstruct the pairwise alignment based on the CIGAR string and the two sequences\n+ """\n+\n+ # reconstruct the alignment\n+ r_pos = cigar[0][1] if cigar[0][0] == BAM_SOFTCLIP else 0\n+ g_pos = 0\n+ r_aln = \'\'\n+ g_aln = \'\'\n+ for edit_op, count in cigar:\n+ if edit_op == BAM_MATCH:\n+ r_aln += read_seq[r_pos : r_pos + count]\n+ g_aln += genome_seq[g_pos : g_pos + count]\n+ r_pos += count\n+ g_pos += count\n+ elif edit_op == BAM_DEL:\n+ r_aln += \'-\'*count\n+ g_aln += genome_seq[g_pos : g_pos + count]\n+ g_pos += count\n+ elif edit_op == BAM_INS:\n+ r_aln += read_seq[r_pos : r_pos + count]\n+ g_aln += \'-\'*count\n+ r_pos += count\n+\n+ return r_aln, g_aln\n+\n+\n+\n+# return sequence is [start, end), not include \'end\'\n+def get_genomic_sequence(genome, start, end, strand = \'+\'):\n+ if strand != \'+\' and strand 
!= \'-\' :\n+ print "[Bug] get_genomic_sequence input should be \\\'+\\\' or \\\'-\\\'."\n+ exit(-1)\n+ if start > 1:\n+ prev = genome[start-2:start]\n+ elif start == 1:\n+ prev = \'N\'+genome[0]\n+ else:\n+ prev = \'NN\'\n+\n+ if end < len(genome) - 1:\n+ next = genome[end: end + 2]\n+ elif end == len(genome) - 1:\n+ next = genome[end] + \'N\'\n+ else:\n+ next = \'NN\'\n+ origin_genome = genome[start:end]\n+\n+ if strand == \'-\':\n+ # reverse complement everything if strand is \'-\'\n+ revc = reverse_compl_seq(\'%s%s%s\' % (prev, origin_genome, next))\n+ prev, origin_genome, next = revc[:2], revc[2:-2], revc[-2:]\n+\n+ return origin_genome, next, \'%s_%s_%s\' % (prev, origin_genome, next)\n+ # next : next two nucleotides\n\\ No newline at end of file\n' |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_align/bs_pair_end.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/bs_align/bs_pair_end.py Fri Jul 12 18:47:28 2013 -0400 |
[ |
b'@@ -0,0 +1,1011 @@\n+\xef\xbb\xbfimport fileinput, os, random, math\n+from bs_utils.utils import *\n+from bs_align_utils import *\n+#----------------------------------------------------------------\n+def extract_mapping(ali_file):\n+ unique_hits = {}\n+ non_unique_hits = {}\n+ header0 = ""\n+ family = []\n+ for header, chr, no_mismatch, location1, cigar1, location2, cigar2 in process_aligner_output(ali_file, pair_end = True):\n+ #------------------------\n+ if header != header0:\n+\n+ # --- output ----\n+ if len(family) == 1:\n+ unique_hits[header0] = family[0]\n+ elif len(family) > 1:\n+ min_lst = min(family, key = lambda x: x[0])\n+ max_lst = max(family, key = lambda x: x[0])\n+\n+ if min_lst[0] < max_lst[0]:\n+ unique_hits[header0] = min_lst\n+ else:\n+ non_unique_hits[header0] = min_lst[0]\n+\n+ header0 = header\n+ family = []\n+ family.append((no_mismatch, chr, location1, cigar1, location2, cigar2))\n+\n+ if len(family) == 1:\n+ unique_hits[header0] = family[0]\n+ elif len(family) > 1:\n+ min_lst = min(family, key = lambda x: x[0])\n+ max_lst = max(family, key = lambda x: x[0])\n+\n+ if min_lst[0] < max_lst[0]:\n+ unique_hits[header0] = min_lst\n+ else:\n+ non_unique_hits[header0] = min_lst[0]\n+\n+ return unique_hits, non_unique_hits\n+\n+def _extract_mapping(ali_file):\n+ U = {}\n+ R = {}\n+ header0 = ""\n+ family = []\n+ for line in fileinput.input(ali_file):\n+ l = line.split()\n+ header = l[0][:-2]\n+ chr = str(l[1])\n+ location = int(l[2])\n+ #no_hits=int(l[4])\n+ #-------- mismatches -----------\n+ if len(l) == 4:\n+ no_mismatch = 0\n+ elif len(l) == 5:\n+ no_mismatch = l[4].count(":")\n+ else:\n+ print l\n+ #------------------------\n+ if header != header0:\n+ #--------------------\n+ if header0 != "":\n+ # --- output ----\n+ if len(family) == 1:\n+ U[header0] = family[0]\n+ else:\n+ if family[0][0] < family[1][0]:\n+ U[header0] = family[0]\n+ elif family[1][0] < family[0][0]:\n+ U[header0] = family[1]\n+ else:\n+ R[header0] = family[0][0]\n+ 
family=[]\n+ # ---------------\n+ header0 = header\n+ family = [[no_mismatch, chr, location]]\n+ member = 1\n+ elif header == header0:\n+ if member == 1:\n+ family[-1][0] += no_mismatch\n+ family[-1].append(location)\n+ member = 2\n+ elif member == 2:\n+ family.append([no_mismatch, chr, location])\n+ member = 1\n+ #------------------------------\n+\n+ fileinput.close()\n+ return U, R\n+\n+\n+#----------------------------------------------------------------\n+\n+def bs_pair_end(main_read_file_1,\n+ main_read_file_2,\n+ asktag,\n+ adapter_file,\n+ cut1,\n+ cut2, # add cut3 and cut4?\n+ no_small_lines,\n+ max_mismatch_no,\n+ aligner_command,\n+ db_path,\n+ tmp_path,\n+ outfile, XS_pct, XS_count, show_multiple_hit=False):\n+\n+\n+ #----------------------------------------------------------------\n+ adapter=""\n+ adapterA=""\n+ adapterB=""\n+ if adapter_file !="":\n+ try :\n+ adapter_inf=open(adapter_file,"r")\n+ if asktag=="N": #<--- directional library\n+ adapter=adapter_inf.readline()\n+ adapter_inf.close()\n+ adapter=adapter.rstrip("\\n")\n+ elif asktag=="Y":#<--- un-direc'..b'ped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1,\n+ output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2)\n+ outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2,\n+ output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1)\n+\n+ print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files))\n+ #----------------------------------------------------------------\n+ #\toutput unmapped pairs\n+ #----------------------------------------------------------------\n+\n+ unmapped_lst=original_bs_reads_1.keys()\n+ unmapped_lst.sort()\n+\n+# for u in unmapped_lst:\n+# outf_u1.write("%s\\n"%(original_bs_reads_1[u]))\n+# outf_u2.write("%s\\n"%(original_bs_reads_2[u]) )\n+\n+ all_unmapped+=len(unmapped_lst)\n+\n+\n+ 
#==================================================================================================\n+# outf.close()\n+#\n+# outf_u1.close()\n+# outf_u2.close()\n+ delete_files(tmp_path)\n+\n+ logm("-------------------------------- " )\n+ logm("O Number of raw BS-read pairs: %d ( %d bp)"%(all_raw_reads,cut2-cut1+1) )\n+ logm("O Number of ends trimmed for adapter: %d"% all_trimmed+"\\n")\n+\n+ if all_raw_reads >0:\n+\n+ logm("O Number of unique-hits read pairs for post-filtering: %d" % all_mapped + "\\n")\n+ if asktag=="Y":\n+ logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )\n+ logm("O -- %7d RC-FW pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) )\n+ logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) )\n+ logm("O -- %7d RC-FW pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) )\n+ elif asktag=="N":\n+ logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )\n+ logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) )\n+\n+\n+ logm("O --- Number of reads rejected because of multiple hits: %d\\n" % len(Multiple_hits) )\n+ logm("O --- %d uniquely aligned pairs, where each end has mismatches <= %s"%(all_mapped_passed, max_mismatch_no) )\n+ if asktag=="Y":\n+ logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) )\n+ logm("O ----- %7d RC-FW pairs mapped to Watson strand"%(numbers_mapped_lst[1]) )\n+ logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[2]) )\n+ logm("O ----- %7d RC-FW pairs mapped to Crick strand"%(numbers_mapped_lst[3]) )\n+ elif asktag=="N":\n+ logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) )\n+ logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[1]) )\n+\n+ logm("O Mappability= 
%1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) )\n+ logm("O Unmapped read pairs: %d"% all_unmapped+"\\n")\n+\n+\n+ n_CG=mC_lst[0]+uC_lst[0]\n+ n_CHG=mC_lst[1]+uC_lst[1]\n+ n_CHH=mC_lst[2]+uC_lst[2]\n+\n+ logm("-------------------------------- " )\n+ logm("Methylated C in mapped reads " )\n+ logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0) )\n+ logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0) )\n+ logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0) )\n+\n+ logm("----------------------------------------------" )\n+ logm("------------------- END ----------------------" )\n+ elapsed("=== END %s %s ===" % (main_read_file_1, main_read_file_2))\n' |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_align/bs_rrbs.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/bs_align/bs_rrbs.py Fri Jul 12 18:47:28 2013 -0400 |
[ |
b'@@ -0,0 +1,1041 @@\n+import fileinput, random, math, os.path\n+from bs_index.rrbs_build import FWD_MAPPABLE_REGIONS, REV_MAPPABLE_REGIONS\n+from bs_utils.utils import *\n+\n+from bs_align.bs_single_end import extract_mapping\n+from bs_align_utils import *\n+\n+def my_mappable_region(chr_regions, mapped_location, FR): # start_position (first C), end_position (last G), serial, sequence\n+ #print len(chr_regions)\n+ out_serial = 0\n+ out_start = -1\n+ out_end = -1\n+ #print "mapped_location:", mapped_location\n+ if FR == "+FW" or FR == "-RC":\n+ my_location = str(mapped_location)\n+ if my_location in chr_regions:\n+ my_lst = chr_regions[my_location]\n+ out_start = int(my_location)\n+ out_end = my_lst[0]\n+ out_serial = my_lst[1]\n+ #else :\n+ # print "[For debug]: +FW location %s cannot be found" % my_location\n+ elif FR == "-FW" or FR == "+RC":\n+ my_location = str(mapped_location)\n+ if my_location in chr_regions:\n+ my_lst = chr_regions[my_location]\n+ out_end = int(my_location)\n+ out_start = my_lst[0]\n+ out_serial = my_lst[1]\n+ #else :\n+ # print "[For debug]: -FW location %s cannot be found" % my_location\n+\n+ return out_serial, out_start, out_end\n+\n+\n+#----------------------------------------------------------------\n+\n+def bs_rrbs(main_read_file, asktag, adapter_file, cut_s, cut_e, no_small_lines, max_mismatch_no,\n+ aligner_command, db_path, tmp_path, outfile, XS_pct, XS_count, adapter_mismatch, cut_format="C-CGG",\n+ show_multiple_hit=False):\n+ #----------------------------------------------------------------\n+ # For double enzyme: cut_format="C-CGG,A-CTG"; ApekI:"G^CWGC"\n+ #cut_context = re.sub("-", "", cut_format)\n+ # Ex. 
cut_format="C-CGG,AT-CG,G-CWGC"\n+ """\n+\n+ :param main_read_file:\n+ :param asktag:\n+ :param adapter_file:\n+ :param cut_s:\n+ :param cut_e:\n+ :param no_small_lines:\n+ :param max_mismatch_no:\n+ :param aligner_command:\n+ :param db_path:\n+ :param tmp_path:\n+ :param outfile:\n+ :param XS_pct:\n+ :param XS_count:\n+ :param adapter_mismatch:\n+ :param cut_format:\n+ """\n+ cut_format_lst = EnumerateIUPAC(cut_format.upper().split(",")) # [\'G-CAGC\', \'AT-CG\', \'C-CGG\', \'G-CTGC\']\n+ cut_context = [i.replace("-","") for i in cut_format_lst] # [\'GCAGC\', \'ATCG\', \'CCGG\', \'GCTGC\']\n+ cut5_context = [re.match( r\'(.*)\\-(.*)\', i).group(1) for i in cut_format_lst] # [\'G\', \'AT\', \'C\', \'G\']\n+ cut3_context = [re.match( r\'(.*)\\-(.*)\', i).group(2) for i in cut_format_lst] # [\'CAGC\', \'CG\', \'CGG\', \'CTGC\']\n+ cut_len = [len(i) for i in cut_context] # [5, 4, 4, 5]\n+ min_cut5_len = min([len(i) for i in cut5_context])\n+ #print cut_format_lst\n+ #print cut_format\n+ #print cut5_context\n+\n+ cut_tag_lst = Enumerate_C_to_CT(cut_format_lst) # [\'G-TTGC\', \'AT-TG\', \'G-CAGT\', \'T-CGG\', \'G-TAGC\', \'C-TGG\', \'G-CAGC\', \'G-CTGC\', \'AT-CG\', \'T-TGG\', \'G-TTGT\', \'G-TAGT\', \'C-CGG\', \'G-CTGT\']\n+ cut5_tag_lst = [re.match(r\'(.*)\\-(.*)\', i).group(1) for i in cut_tag_lst]\n+ cut3_tag_lst = [re.match(r\'(.*)\\-(.*)\', i).group(2) for i in cut_tag_lst]\n+ check_pattern = [ i[-2:]+"_"+j for i,j in zip(cut5_tag_lst, cut3_tag_lst) ]\n+\n+ #print "======="\n+ #print cut_tag_lst\n+ #print cut3_tag_lst\n+ #print cut5_tag_lst\n+ #print check_pattern\n+\n+ # set region[gx,gy] for checking_genome_context\n+ gx = [ 0 if j>2 else 2-j for j in [len(i) for i in cut5_tag_lst] ] # [XC-CGG]\n+ gy = [ 3+len(i) for i in cut3_tag_lst ]\n+\n+\n+ #----------------------------------------------------------------\n+\n+ # helper method to join fname with tmp_path\n+ tmp_d = lambda fname: os.path.join(tmp_path, fname)\n+ db_d = lambda fname: os.path.join(db_path, 
fname)\n+\n+ MAX_TRY = 500 # For finding the serial_no\n+ whole_adapter_seq = ""\n+ #----------------------------------------------------------------\n+ adap'..b'\n+ # print "[For debug]: RC_C2A read still cannot find fragment serial"\n+\n+\n+ N_mismatch = N_MIS(r_aln, g_aln)\n+ if N_mismatch <= int(max_mismatch_no) :\n+ all_mapped_passed += 1\n+ methy = methy_seq(r_aln, g_aln + next2bp)\n+ mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)\n+ #---XS FILTER----------------\n+ XS = 0\n+ nCH = methy.count(\'y\') + methy.count(\'z\')\n+ nmCH = methy.count(\'Y\') + methy.count(\'Z\')\n+ if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :\n+ XS = 1\n+ num_mapped_RC_G2A += 1\n+ outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,\n+ mapped_location, cigar, original_BS, methy, XS,\n+ output_genome = output_genome,\n+ rrbs = True,\n+ my_region_serial = my_region_serial,\n+ my_region_start = my_region_start,\n+ my_region_end = my_region_end)\n+ else :\n+ print "[For debug]: reads not in same lengths"\n+\n+\n+\n+ # Finished both FW and RC\n+ logm("Done: %s (%d) \\n" % (read_file, no_my_files))\n+ print "--> %s (%d) "%(read_file, no_my_files)\n+ del original_bs_reads\n+ delete_files(WC2T, CC2T, WG2A, CG2A)\n+\n+\n+\n+ # End of un-directional library\n+\n+ delete_files(tmp_path)\n+\n+\n+ logm("O Number of raw reads: %d "% all_raw_reads)\n+ if all_raw_reads >0:\n+ logm("O Number of CGG/TGG tagged reads: %d (%1.3f)"%(all_tagged,float(all_tagged)/all_raw_reads))\n+ for kk in range(len(n_cut_tag_lst)):\n+ logm("O Number of raw reads with %s tag: %d (%1.3f)"%(cut3_tag_lst[kk],n_cut_tag_lst[cut3_tag_lst[kk]],float(n_cut_tag_lst[cut3_tag_lst[kk]])/all_raw_reads))\n+ logm("O Number of CGG/TGG reads having adapter removed: %d "%all_tagged_trimmed)\n+ logm("O Number of reads rejected because of multiple hits: %d\\n" % len(Multiple_hits) )\n+ logm("O Number of unique-hits reads for post-filtering: %d"%all_mapped)\n+\n+ logm("O ------ %d uniquely aligned reads, passed 
fragment check, with mismatches <= %s"%(all_mapped_passed, max_mismatch_no))\n+ logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads))\n+\n+ if asktag=="Y": # undiretional\n+ logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) )\n+ logm(" ---- %7d RC reads mapped to Watson strand"%(num_mapped_FW_G2A) )\n+ logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) )\n+ logm(" ---- %7d RC reads mapped to Crick strand"%(num_mapped_RC_G2A) )\n+ # the variable name \'num_mapped_RC_G2A\' seems not consistent with illustration\n+ # according to literal meaning\n+ elif asktag=="N": # directional\n+ logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) )\n+ logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) )\n+\n+ n_CG=mC_lst[0]+uC_lst[0]\n+ n_CHG=mC_lst[1]+uC_lst[1]\n+ n_CHH=mC_lst[2]+uC_lst[2]\n+\n+ logm("----------------------------------------------")\n+ logm("M Methylated C in mapped reads ")\n+ logm("M mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0))\n+ logm("M mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0))\n+ logm("M mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0))\n+ logm("----------------------------------------------")\n+ logm("------------------- END ----------------------")\n+\n+ elapsed(main_read_file)\n+\n+ close_log()\n+\n+\n' |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_align/bs_single_end.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/bs_align/bs_single_end.py Fri Jul 12 18:47:28 2013 -0400 |
[ |
b'@@ -0,0 +1,732 @@\n+import fileinput, os, time, random, math\n+from bs_utils.utils import *\n+from bs_align_utils import *\n+\n+#----------------------------------------------------------------\n+# Read from the mapped results, return lists of unique / multiple-hit reads\n+# The function suppose at most 2 hits will be reported in single file\n+def extract_mapping(ali_file):\n+ unique_hits = {}\n+ non_unique_hits = {}\n+\n+ header0 = ""\n+ lst = []\n+\n+ for header, chr, location, no_mismatch, cigar in process_aligner_output(ali_file):\n+ #------------------------------\n+ if header != header0:\n+ #---------- output -----------\n+ if len(lst) == 1:\n+ unique_hits[header0] = lst[0] # [no_mismatch, chr, location]\n+ elif len(lst) > 1:\n+ min_lst = min(lst, key = lambda x: x[0])\n+ max_lst = max(lst, key = lambda x: x[0])\n+\n+ if min_lst[0] < max_lst[0]:\n+ unique_hits[header0] = min_lst\n+ else:\n+ non_unique_hits[header0] = min_lst[0]\n+ #print "multiple hit", header, chr, location, no_mismatch, cigar # test\n+ header0 = header\n+ lst = [(no_mismatch, chr, location, cigar)]\n+ else: # header == header0, same header (read id)\n+ lst.append((no_mismatch, chr, location, cigar))\n+\n+ if len(lst) == 1:\n+ unique_hits[header0] = lst[0] # [no_mismatch, chr, location]\n+ elif len(lst) > 1:\n+ min_lst = min(lst, key = lambda x: x[0])\n+ max_lst = max(lst, key = lambda x: x[0])\n+\n+ if min_lst[0] < max_lst[0]:\n+ unique_hits[header0] = min_lst\n+ else:\n+ non_unique_hits[header0] = min_lst[0]\n+\n+\n+ return unique_hits, non_unique_hits\n+\n+\n+def bs_single_end(main_read_file, asktag, adapter_file, cut1, cut2, no_small_lines,\n+ max_mismatch_no, aligner_command, db_path, tmp_path, outfile,\n+ XS_pct, XS_count, adapter_mismatch, show_multiple_hit=False):\n+ #----------------------------------------------------------------\n+ # adapter : strand-specific or not\n+ adapter=""\n+ adapter_fw=""\n+ adapter_rc=""\n+ if adapter_file !="":\n+ try :\n+ 
adapter_inf=open(adapter_file,"r")\n+ except IOError:\n+ print "[Error] Cannot open adapter file : %s" % adapter_file\n+ exit(-1)\n+ if asktag == "N": #<--- directional library\n+ adapter=adapter_inf.readline()\n+ adapter_inf.close()\n+ adapter=adapter.rstrip("\\n")\n+ elif asktag == "Y":#<--- un-directional library\n+ adapter_fw=adapter_inf.readline()\n+ adapter_rc=adapter_inf.readline()\n+ adapter_inf.close()\n+ adapter_fw=adapter_fw.rstrip("\\n")\n+ adapter_rc=adapter_rc.rstrip("\\n")\n+ adapter_inf.close()\n+ #----------------------------------------------------------------\n+\n+\n+\n+ #----------------------------------------------------------------\n+ logm("Read filename: %s"% main_read_file )\n+ logm("Un-directional library: %s" % asktag )\n+ logm("The first base (for mapping): %d" % cut1)\n+ logm("The last base (for mapping): %d" % cut2)\n+ logm("Max. lines per mapping: %d"% no_small_lines)\n+ logm("Aligner: %s" % aligner_command)\n+ logm("Reference genome library path: %s" % db_path )\n+ logm("Number of mismatches allowed: %s" % max_mismatch_no )\n+ if adapter_file !="":\n+ if asktag=="N":\n+ logm("Adapter to be removed from 3\' reads: %s"%(adapter.rstrip("\\n")))\n+ elif asktag=="Y":\n+ logm("Adapter to be removed from 3\' FW reads: %s"%(adapter_fw.rstrip("\\n")) )\n+ logm("Adapter to be removed from 3\' RC reads: %s"%(adapter_rc.rstrip("\\n")) )\n+ #----------------------------------------------------------------\n+\n+ # helper method to join fname with tmp_path\n+ tmp_d = lambda fname: os.path'..b'mic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand)\n+ r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)\n+\n+ if len(r_aln) == len(g_aln):\n+ N_mismatch = N_MIS(r_aln, g_aln) #+ original_BS_length - (r_end - r_start) # mismatches in the alignment + soft clipped nucleotides\n+ if N_mismatch <= int(max_mismatch_no):\n+ numbers_mapped_lst[nn-1] += 1\n+ all_mapped_passed += 1\n+ methy = methy_seq(r_aln, 
g_aln+next)\n+ mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)\n+\n+ #---XS FILTER----------------\n+ XS = 0\n+ nCH = methy.count(\'y\') + methy.count(\'z\')\n+ nmCH = methy.count(\'Y\') + methy.count(\'Z\')\n+ if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :\n+ XS = 1\n+\n+ outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome)\n+\n+ #----------------------------------------------------------------\n+ logm("--> %s (%d) "%(read_file,no_my_files))\n+ delete_files(WC2T, CC2T)\n+\n+\n+ #----------------------------------------------------------------\n+\n+# outf.close()\n+ delete_files(tmp_path)\n+\n+ logm("Number of raw reads: %d \\n"% all_raw_reads)\n+ if all_raw_reads >0:\n+ logm("Number of reads having adapter removed: %d \\n" % all_trimed )\n+ logm("Number of reads rejected because of multiple hits: %d\\n" % len(Multiple_hits) )\n+ logm("Number of unique-hits reads for post-filtering: %d\\n" % all_mapped)\n+ if asktag=="Y":\n+ logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )\n+ logm(" ---- %7d RC reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) )\n+ logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) )\n+ logm(" ---- %7d RC reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) )\n+ elif asktag=="N":\n+ logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )\n+ logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) )\n+\n+ logm("Post-filtering %d uniquely aligned reads with mismatches <= %s"%(all_mapped_passed, max_mismatch_no) )\n+ if asktag=="Y":\n+ logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) )\n+ logm(" ---- %7d RC reads mapped to Watson strand"%(numbers_mapped_lst[1]) )\n+ logm(" ---- 
%7d FW reads mapped to Crick strand"%(numbers_mapped_lst[2]) )\n+ logm(" ---- %7d RC reads mapped to Crick strand"%(numbers_mapped_lst[3]) )\n+ elif asktag=="N":\n+ logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) )\n+ logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[1]) )\n+ logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) )\n+\n+ n_CG=mC_lst[0]+uC_lst[0]\n+ n_CHG=mC_lst[1]+uC_lst[1]\n+ n_CHH=mC_lst[2]+uC_lst[2]\n+\n+ logm("----------------------------------------------" )\n+ logm("Methylated C in mapped reads ")\n+\n+ logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0))\n+ logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0))\n+ logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0))\n+\n+ logm("------------------- END --------------------" )\n+ elapsed("=== END %s ===" % main_read_file)\n+\n+\n+\n' |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_align/output.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/bs_align/output.py Fri Jul 12 18:47:28 2013 -0400 |
[ |
@@ -0,0 +1,83 @@ +try : + import pysam +except ImportError : + print "[Error] It seems that you haven't install \"pysam\" package.. Please do it before you run this script." + exit(-1) + +import sys +from bs_align_utils import * + +BAM = 'bam' +SAM = 'sam' +BS_SEEKER1 = 'bs_seeker1' + +formats = [BAM, SAM, BS_SEEKER1] + +class outfile: + def __init__(self, filename, format, chrom_len, cmd_line, suppress_SAM_header): + self.filename = filename + self.format = format + self.chrom_ids = dict((k, i) for i, k in enumerate(sorted(chrom_len))) + + if format == BS_SEEKER1: + self.f = open(filename, 'w') + elif format in [SAM, BAM]: + header = { 'HD' : { 'VN': '1.0'}, + 'SQ' : [ {'LN' : chrom_len[c], 'SN' : c} for c in sorted(chrom_len) ], + 'PG' : [ { 'ID' : 1, 'PN' : 'BS Seeker 2', 'CL' : cmd_line} ] + } + self.f = pysam.Samfile(filename, 'w' + ('b' if format == BAM else ('' if suppress_SAM_header else 'h')), header = header) + + + + def close(self): + self.f.close() + + def store(self, qname, N_mismatch, FR, refname, strand, pos, cigar, original_BS, methy, STEVE, rnext = -1, pnext = -1, qual = None, output_genome = None, + rrbs = False, my_region_serial = -1, my_region_start = 0, my_region_end = 0): + + if self.format == BS_SEEKER1: + + # remove the soft clipped bases from the read + # this is done for backwards compatibility with the old format + r_start, r_end, _ = get_read_start_end_and_genome_length(cigar) + original_BS = original_BS[r_start : r_end] + + if rrbs: + self.f.write('%s\t%2d\t%s\t%s%s%s\t%s\t%s\t%s\t%d\n' % (qname, N_mismatch, FR, refname, strand, str(pos+1).zfill(10), output_genome, original_BS, methy, STEVE)) + else: + self.f.write('%s\t%2d\t%s\t%s%s%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n' % (qname, N_mismatch, FR, refname, strand, str(pos+1).zfill(10), output_genome, original_BS, methy, my_region_serial, my_region_start, my_region_end, STEVE)) + + + elif self.format == BAM or self.format == SAM: + + a = pysam.AlignedRead() + a.qname = qname + a.seq = 
original_BS if strand == '+' else reverse_compl_seq(original_BS) + a.flag = 0x10 if strand == '-' else 0 + a.tid = self.chrom_ids[refname] + a.pos = pos + a.mapq = 255 + a.cigar = cigar if strand == '+' else list(reversed(cigar)) + a.rnext = rnext if rnext == -1 else self.chrom_ids[rnext] + a.pnext = pnext + a.qual= qual + if rrbs: + a.tags = (('XO', FR), + ('XS', STEVE), + ('NM', N_mismatch), + ('XM', methy), + ('XG', output_genome), + ('YR', my_region_serial), + ('YS', my_region_start), + ('YE', my_region_end) + ) + + else: + a.tags = (('XO', FR), + ('XS', STEVE), + ('NM', N_mismatch), + ('XM', methy), + ('XG', output_genome)) + + self.f.write(a) |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_index/.___init__.py |
b |
Binary file BSseeker2/bs_index/.___init__.py has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_index/._rrbs_build.py |
b |
Binary file BSseeker2/bs_index/._rrbs_build.py has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_index/._wg_build.py |
b |
Binary file BSseeker2/bs_index/._wg_build.py has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_index/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/bs_index/__init__.py Fri Jul 12 18:47:28 2013 -0400 |
b |
@@ -0,0 +1,1 @@ +__author__ = 'pf' |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_index/rrbs_build.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/bs_index/rrbs_build.py Fri Jul 12 18:47:28 2013 -0400 |
[ |
import os

from bs_utils.utils import *


# File-name helpers for the per-chromosome serialized region maps.
FWD_MAPPABLE_REGIONS = lambda chrom_id: chrom_id+'.fwd_mappable_regions'
REV_MAPPABLE_REGIONS = lambda chrom_id: chrom_id+'.rev_mappable_regions'


# example: T-CGA

def rrbs_build(fasta_file, build_command, ref_path, low_bound, up_bound, aligner, cut_format="C-CGG"):
    """Build an RRBS reference: find restriction fragments of eligible length,
    mask everything else with '-', write bisulfite-converted genomes and
    launch the aligner's index builder on them.

    fasta_file    : input reference genome (fasta)
    build_command : aligner index-build command template with a %(fname)s slot
    ref_path      : base directory for the generated reference
    low_bound/up_bound : fragment-length limits (excluding the cut sites)
    aligner       : aligner name, appended to the output directory name
    cut_format    : restriction site(s), e.g. "C-CGG" or "C-CGG,C-TAG"
    """
    # ref_path is a string that contains the directory where the reference genomes are stored with
    # the input fasta filename appended

    cut_format = cut_format.upper() # Ex. "C-CGG,C-TAG"; MspI&ApekI:"G^CWGC"
    # Directory name encodes the cut sites (when non-default) and the bounds.
    if cut_format == "C-CGG" :
        ref_path = os.path.join(ref_path,
                                os.path.split(fasta_file)[1] + '_rrbs_%d_%d' % (low_bound, up_bound) +'_' + aligner)
    else :
        ref_path = os.path.join(ref_path,
                                os.path.split(fasta_file)[1] + '_rrbs_%s_%d_%d' % ( cut_format.replace("-","").replace(",","-"), low_bound, up_bound) +'_' + aligner)

    ref_p = lambda filename: os.path.join(ref_path, filename)

    clear_dir(ref_path)
    open_log(ref_p('log'))

    refd = {}   # chromosome name -> length
    # Four converted genomes: Watson/Crick x C->T / G->A.
    w_c2t = open(ref_p('W_C2T.fa'),'w')
    c_c2t = open(ref_p('C_C2T.fa'),'w')

    w_g2a = open(ref_p('W_G2A.fa'),'w')
    c_g2a = open(ref_p('C_G2A.fa'),'w')

    mappable_regions_output_file = open(ref_p("RRBS_mappable_regions.txt"),"w")

    # Genome-wide totals, accumulated per chromosome.
    all_L = 0
    all_mappable_length = 0
    all_unmappable_length = 0

    no_mappable_region = 0   # running serial number of eligible fragments
    total_chromosomes = 0

#    cut_context = re.sub("-", "", cut_format).split(",")
    # Expand IUPAC wildcards in the cut sites into all concrete sequences.
    cut_context = EnumerateIUPAC(cut_format.replace("-","").split(","))
    cut_len = [len(i) for i in cut_context]
    cut_len_max = max(cut_len)


    for chrom_id, chrom_seq in read_fasta(fasta_file):
        total_chromosomes += 1
        refd[chrom_id] = len(chrom_seq)

        fwd_chr_regions = {}
        rev_chr_regions = {}

        L = len(chrom_seq)
        XXXX_sites = []   # (first_pos, last_pos) of every cut-site occurrence
        XXXX_XXXX = []    # [left_end, right_end] of every eligible fragment

        #-- collect all "XXXX sites ---------------------------------
        # NOTE(review): scan starts at i = 1, so a cut site at position 0
        # would be missed — confirm this is intentional.
        i = 1
        while i <= L - cut_len_max:
            j = 0
            while j < len(cut_len) :
                if chrom_seq[i : i + cut_len[j]] == cut_context[j]:
                    XXXX_sites.append( (i, i + cut_len[j] - 1) ) # add (1st position, last position)
                j += 1
            i += 1

        #-- find "XXXX" pairs that are within the length of fragment ----
        for j in xrange(len(XXXX_sites) - 1):
            DD = (XXXX_sites[j+1][0] - XXXX_sites[j][1]) - 1 # NOT including both XXXX; DD: fragment length
            if low_bound <= DD <= up_bound:
                XXXX_XXXX.append([XXXX_sites[j][0], XXXX_sites[j+1][1]]) # leftmost <--> rightmost
                mappable_seq = chrom_seq[XXXX_sites[j][0] : XXXX_sites[j+1][1] + 1]
                no_mappable_region += 1

                fwd_chr_regions[str(XXXX_sites[j][0])] = [XXXX_sites[j+1][1], no_mappable_region]
                rev_chr_regions[str(XXXX_sites[j+1][1])] = [XXXX_sites[j][0], no_mappable_region]

                # start_position, end_position, serial, sequence
                mappable_regions_output_file.write("%s\t%d\t%d\t%d\t%s\n"%(chrom_id, no_mappable_region,
                                                                           XXXX_sites[j][0], XXXX_sites[j+1][1], mappable_seq))
        # storing region information to file
        # format: A[left_CCGG_pos]=[right_CCGG_pos, number_of_mappable_region]
        serialize(fwd_chr_regions, ref_p(FWD_MAPPABLE_REGIONS(chrom_id)))
        serialize(rev_chr_regions, ref_p(REV_MAPPABLE_REGIONS(chrom_id)))

        #-----------------------------------
        # mask the genome: keep bases inside eligible fragments, replace all
        # others with '-'.  XXXX_XXXX is consumed front-to-back as the scan
        # passes the right end of each fragment.
        _map_seq = []
        mappable_length = 0
        unmappable_length = 0
        m = 0
        mark = False   # True while inside (or just past) the current fragment
        while m < L: # for every nucleotide in chr
            if len(XXXX_XXXX) > 0:
                pair = XXXX_XXXX[0]
                p1 = pair[0]   # left end of fragment
                p2 = pair[1]   # right end of fragment
                if p1 <= m < p2 + 1 :
                    _map_seq.append(chrom_seq[m])
                    mappable_length+=1
                    mark = True
                else :
                    if not mark:
                        _map_seq.append("-")
                        unmappable_length += 1
                    else: # the last eligible base: drop the finished fragment
                        _ = XXXX_XXXX.pop(0)
                        # the next fragment may start at this very position
                        # (overlapping fragments), so re-test immediately.
                        if len(XXXX_XXXX)>0:
                            pair = XXXX_XXXX[0]
                            p1 = pair[0]
                            p2 = pair[1]
                            if p1 <= m < p2 + 1:
                                _map_seq.append(chrom_seq[m])
                                mappable_length += 1
                                mark = True
                            else:
                                _map_seq.append("-")
                                unmappable_length += 1
                                mark = False
                        else:
                            _map_seq.append("-")
                            unmappable_length+=1
                            mark = False
            else:
                # no fragments left: everything from here on is masked
                if not mark:
                    _map_seq.append("-")
                    unmappable_length+=1
                else:
                    _map_seq.append("-")
                    unmappable_length+=1
                    mark = False

            m+=1

        #-----------------------------------

        # Store the masked chromosome and write its bisulfite conversions.
        chrom_seq = ''.join(_map_seq)
        serialize(chrom_seq, ref_p(chrom_id))

        w_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C","T")))
        w_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G","A")))

        chrom_seq = reverse_compl_seq(chrom_seq)

        c_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C","T")))
        c_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G","A")))

        #-----------------------------------
        logm("# %s : all (%d) : unmappable (%d) : mappable (%d) : ratio (%1.5f)"%(chrom_id,
                                                                                  L,
                                                                                  unmappable_length,
                                                                                  mappable_length,
                                                                                  float(mappable_length)/L) )
        all_L += L
        all_mappable_length += mappable_length
        all_unmappable_length += unmappable_length

        elapsed('Finished initial pre-processing of ' + chrom_id)


    for outf in [w_c2t, c_c2t, w_g2a, c_g2a]:
        outf.close()


    logm("# Total %d chromosome(s) : all (%d) : unmappable (%d) : mappable (%d) : ratio (%1.5f)" %(total_chromosomes,
                                                                                                   all_L,
                                                                                                   all_unmappable_length,
                                                                                                   all_mappable_length,
                                                                                                   float(all_mappable_length)/all_L) )
    logm("# eligible fragments : %d" % no_mappable_region )

    serialize(refd, ref_p("refname"))

    mappable_regions_output_file.close()
    elapsed('Storing mappable regions and genome')

    #---------------- bowtie-build -------------------------------------------

    # append ref_path to all elements of to_bowtie
    to_bowtie = map(lambda f: os.path.join(ref_path, f), ['W_C2T', 'W_G2A', 'C_C2T', 'C_G2A'])

    run_in_parallel([(build_command % { 'fname' : fname }, fname + '.log') for fname in to_bowtie])

    elapsed('Index building')
    # delete all fasta files
    delete_files( f +'.fa' for f in to_bowtie)

    elapsed('END')
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_index/wg_build.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/bs_index/wg_build.py Fri Jul 12 18:47:28 2013 -0400 |
[ |
@@ -0,0 +1,55 @@ +from bs_utils.utils import * + + +def wg_build(fasta_file, build_command, ref_path, aligner): + + # ref_path is a string that containts the directory where the reference genomes are stored with + # the input fasta filename appended + ref_path = os.path.join(ref_path, + os.path.split(fasta_file)[1] + '_'+aligner) + + clear_dir(ref_path) + #--------------------------------------------------------------- + # 1. First get the complementary genome (also do the reverse) + # 2. Then do CT and GA conversions + #--------------------------------------------------------------- + + open_log(os.path.join(ref_path, 'log')) + refd = {} + w_c2t = open(os.path.join(ref_path, 'W_C2T.fa'),'w') + c_c2t = open(os.path.join(ref_path, 'C_C2T.fa'),'w') + + w_g2a = open(os.path.join(ref_path, 'W_G2A.fa'),'w') + c_g2a = open(os.path.join(ref_path, 'C_G2A.fa'),'w') + for chrom_id, chrom_seq in read_fasta(fasta_file): + serialize(chrom_seq, os.path.join(ref_path, chrom_id)) + refd[chrom_id] = len(chrom_seq) + + w_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C","T"))) + w_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G","A"))) + + chrom_seq = reverse_compl_seq(chrom_seq) + + c_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C","T"))) + c_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G","A"))) + + elapsed('Preprocessing '+chrom_id) + + for outf in [w_c2t, c_c2t, w_g2a, c_g2a]: + outf.close() + + serialize(refd, os.path.join(ref_path,"refname")) + elapsed('Genome preprocessing') + # append ref_path to all elements of to_bowtie + to_bowtie = map(lambda f: os.path.join(ref_path, f), ['W_C2T', 'W_G2A', 'C_C2T', 'C_G2A']) + + # start bowtie-build for all converted genomes and wait for the processes to finish + + run_in_parallel([(build_command % { 'fname' : fname }, fname+'.log') for fname in to_bowtie]) + + # delete fasta files of converted genomes + if aligner != "rmap" : + delete_files(f+'.fa' for f in to_bowtie) + + elapsed('Done') + |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_seeker2-align.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/bs_seeker2-align.py Fri Jul 12 18:47:28 2013 -0400 |
[ |
b'@@ -0,0 +1,349 @@\n+#!/usr/bin/python\n+\n+from optparse import OptionParser, OptionGroup\n+import re\n+import tempfile\n+from bs_align import output\n+from bs_align.bs_pair_end import *\n+from bs_align.bs_single_end import *\n+from bs_align.bs_rrbs import *\n+from bs_utils.utils import *\n+\n+\n+if __name__ == \'__main__\':\n+\n+ parser = OptionParser()\n+ # option group 1\n+ opt_group = OptionGroup(parser, "For single end reads")\n+ opt_group.add_option("-i", "--input", type="string", dest="infilename",help="Input your read file name (FORMAT: sequences, fastq, qseq,fasta)", metavar="INFILE")\n+ parser.add_option_group(opt_group)\n+\n+ # option group 2\n+ opt_group = OptionGroup(parser, "For pair end reads")\n+ opt_group.add_option("-1", "--input_1", type="string", dest="infilename_1",help="Input your read file end 1 (FORMAT: sequences, qseq, fasta, fastq)", metavar="FILE")\n+ opt_group.add_option("-2", "--input_2", type="string", dest="infilename_2",help="Input your read file end 2 (FORMAT: sequences, qseq, fasta, fastq)", metavar="FILE")\n+ opt_group.add_option("--minins",type = "int",dest = "min_insert_size", help="The minimum insert size for valid paired-end alignments [Default: %default]", default = -1)\n+ opt_group.add_option("--maxins",type = "int",dest = "max_insert_size", help="The maximum insert size for valid paired-end alignments [Default: %default]", default = 400)\n+ parser.add_option_group(opt_group)\n+\n+ # option group 3\n+ opt_group = OptionGroup(parser, "Reduced Representation Bisulfite Sequencing Options")\n+ opt_group.add_option("-r", "--rrbs", action="store_true", dest="rrbs", default = False, help = \'Process reads from Reduced Representation Bisulfite Sequencing experiments\')\n+ opt_group.add_option("-c", "--cut-site", type="string",dest="cut_format", help="Cutting sites of restriction enzyme. Ex: MspI(C-CGG), Mael:(C-TAG), double-enzyme MspI&Mael:(C-CGG,C-TAG). 
[Default: %default]", metavar="pattern", default = "C-CGG")\n+ opt_group.add_option("-L", "--low", type = "int", dest="rrbs_low_bound", help="lower bound of fragment length (excluding C-CGG ends) [Default: %default]", default = 40)\n+ opt_group.add_option("-U", "--up", type = "int", dest="rrbs_up_bound", help="upper bound of fragment length (excluding C-CGG ends) [Default: %default]", default = 500)\n+ parser.add_option_group(opt_group)\n+\n+ # option group 4\n+ opt_group = OptionGroup(parser, "General options")\n+ opt_group.add_option("-t", "--tag", type="string", dest="taginfo",help="[Y]es for undirectional lib, [N]o for directional [Default: %default]", metavar="TAG", default = \'N\')\n+ opt_group.add_option("-s","--start_base",type = "int",dest = "cutnumber1", help="The first base of your read to be mapped [Default: %default]", default = 1)\n+ opt_group.add_option("-e","--end_base",type = "int",dest = "cutnumber2", help="The last cycle number of your read to be mapped [Default: %default]", default = 200)\n+ opt_group.add_option("-a", "--adapter", type="string", dest="adapter_file",help="Input text file of your adaptor sequences (to be trimed from the 3\'end of the reads). Input 1 seq for dir. lib., 2 seqs for undir. lib. One line per sequence", metavar="FILE", default = \'\')\n+ opt_group.add_option("--am",type = "int",dest = "adapter_mismatch", help="Number of mismatches allowed in adaptor [Default: %default]", default = 0)\n+ opt_group.add_option("-g", "--genome", type="string", dest="genome",help="Name of the reference genome (the same as the reference genome file in the preprocessing step) [ex. 
chr21_hg18.fa]")\n+ opt_group.add_option("-m", "--mismatches",type = "int", dest="int_no_mismatches",help="Number of mismatches in one read [Default: %default]", default = 4)\n+ opt_group.add_option("--aligner", dest="aligner",help="Aligner program to perform the analisys: " + \', \'.join(supported_aligners) + " [Default: %default]", metavar="ALIGNER", default = BOWTIE2)\n+ opt_group.add_option("-p", "--path", de'..b" options.adapter_file,\n+ options.cutnumber1,\n+ options.cutnumber2,\n+ options.no_split,\n+ str_no_mismatches,\n+ aligner_command,\n+ db_path,\n+ tmp_path,\n+ outfile,\n+ XS_pct,\n+ XS_count,\n+ options.adapter_mismatch,\n+ options.cut_format,\n+ options.Output_multiple_hit\n+ )\n+ else: # Normal single end scan\n+ bs_single_end( options.infilename,\n+ asktag,\n+ options.adapter_file,\n+ options.cutnumber1,\n+ options.cutnumber2,\n+ options.no_split,\n+ str_no_mismatches,\n+ aligner_command,\n+ db_path,\n+ tmp_path,\n+ outfile,\n+ XS_pct,\n+ XS_count,\n+ options.adapter_mismatch,\n+ options.Output_multiple_hit\n+ )\n+ else:\n+ logm('Pair end')\n+ # pair end specific default options\n+ aligner_options = dict({BOWTIE: {'--ff' : asktag == 'N',\n+ '--fr' : asktag == 'Y',\n+ '-X' : options.max_insert_size,\n+ '-I' : options.min_insert_size if options.min_insert_size > 0 else None\n+ },\n+ BOWTIE2 : {\n+ '--ff' : asktag == 'N',\n+ '--fr' : asktag == 'Y',\n+ '-X' : options.max_insert_size,\n+ '-I' : options.min_insert_size if options.min_insert_size > 0 else None,\n+ '--no-discordant' : True,\n+ '--no-mixed' : True\n+ },\n+ SOAP: {\n+ '-x' : options.max_insert_size,\n+ '-m' : options.min_insert_size if options.min_insert_size > 0 else 100\n+ }}[options.aligner],\n+ # integrating 'rmappe' is different from others\n+ **aligner_options)\n+\n+ aligner_command = aligner_exec + aligner_options_string() + \\\n+ { BOWTIE : ' %(reference_genome)s -f -1 %(input_file_1)s -2 %(input_file_2)s %(output_file)s',\n+ BOWTIE2 : ' -x %(reference_genome)s -f -1 %(input_file_1)s 
-2 %(input_file_2)s -S %(output_file)s',\n+ SOAP : ' -D %(reference_genome)s.fa.index -o %(output_file)s -a %(input_file_1)s -b %(input_file_2)s -2 %(output_file)s.unpaired' #,\n+ # RMAP : # rmappe, also paste two inputs into one file.\n+ }[options.aligner]\n+\n+ logm('Aligner command: %s' % aligner_command)\n+\n+ bs_pair_end(options.infilename_1,\n+ options.infilename_2,\n+ asktag,\n+ options.adapter_file,\n+ options.cutnumber1,\n+ options.cutnumber2,\n+ options.no_split,\n+ str_no_mismatches,\n+ aligner_command,\n+ db_path,\n+ tmp_path,\n+ outfile,\n+ XS_pct,\n+ XS_count,\n+ options.Output_multiple_hit\n+ )\n+\n+ outfile.close()\n+\n" |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_seeker2-build.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/bs_seeker2-build.py Fri Jul 12 18:47:28 2013 -0400 |
[ |
@@ -0,0 +1,87 @@ +#!/usr/bin/python + +import os +from optparse import OptionParser, OptionGroup +from bs_index.wg_build import * +from bs_index.rrbs_build import * +from bs_utils.utils import * + + +if __name__ == '__main__': + + parser = OptionParser() + + parser.add_option("-f", "--file", dest="filename", help="Input your reference genome file (fasta)", metavar="FILE") + parser.add_option("--aligner", dest="aligner", help="Aligner program to perform the analysis: " + ', '.join(supported_aligners) + " [Default: %default]", metavar="ALIGNER", default = BOWTIE2) + parser.add_option("-p", "--path", dest="aligner_path", help="Path to the aligner program. Defaults: " +' '*70+ '\t'.join(('%s: %s '+' '*70) % (al, aligner_path[al]) for al in sorted(supported_aligners)), + metavar="PATH") + parser.add_option("-d", "--db", type="string", dest="dbpath", help="Path to the reference genome library (generated in preprocessing genome) [Default: %default]", metavar="DBPATH", default = reference_genome_path) + + parser.add_option("-v", "--version", action="store_true", dest="version", help="show version of BS-Seeker2", default=False) + + # RRBS options + rrbs_opts = OptionGroup(parser, "Reduced Representation Bisulfite Sequencing Options", + "Use this options with conjuction of -r [--rrbs]") + rrbs_opts.add_option("-r", "--rrbs", action="store_true", dest="rrbs", help = 'Build index specially for Reduced Representation Bisulfite Sequencing experiments. Genome other than certain fragments will be masked. 
[Default: %default]', default = False) + rrbs_opts.add_option("-l", "--low",type= "int", dest="low_bound", help="lower bound of fragment length (excluding recognition sequence such as C-CGG) [Default: %default]", default = 40) + rrbs_opts.add_option("-u", "--up", type= "int", dest="up_bound", help="upper bound of fragment length (excluding recognition sequence such as C-CGG ends) [Default: %default]", default = 500) + rrbs_opts.add_option("-c", "--cut-site", type= "string", dest="cut_format", help="Cut sites of restriction enzyme. Ex: MspI(C-CGG), Mael:(C-TAG), double-enzyme MspI&Mael:(C-CGG,C-TAG). [Default: %default]", default = "C-CGG") + parser.add_option_group(rrbs_opts) + + + (options, args) = parser.parse_args() + + # if no options were given by the user, print help and exit + if len(sys.argv) == 1: + print parser.print_help() + exit(0) + + if options.version : + show_version() + exit (-1) + else : + show_version() + + rrbs = options.rrbs + + fasta_file=os.path.expanduser(options.filename) + if fasta_file is None: + error('Fasta file for the reference genome must be supported') + + if not os.path.isfile(fasta_file): + error('%s cannot be found' % fasta_file) + + if options.aligner not in supported_aligners: + error('-a option should be: %s' % ' ,'.join(supported_aligners)+'.') + + builder_exec = os.path.join(options.aligner_path or aligner_path[options.aligner], + {BOWTIE : 'bowtie-build', + BOWTIE2 : 'bowtie2-build', + SOAP : '2bwt-builder', + RMAP : '' # do nothing + }[options.aligner]) + + build_command = builder_exec + { BOWTIE : ' -f %(fname)s.fa %(fname)s', + BOWTIE2 : ' -f %(fname)s.fa %(fname)s', + SOAP : ' %(fname)s.fa' + }[options.aligner] + + + print "Reference genome file: %s" % fasta_file + print "Reduced Representation Bisulfite Sequencing: %s" % rrbs + print "Builder path: %s" % builder_exec + #--------------------------------------------------------------- + + ref_path = options.dbpath + + if os.path.exists(ref_path): + if not 
os.path.isdir(ref_path): + error("%s must be a directory. Please, delete it or change the -d option." % ref_path) + else: + os.mkdir(ref_path) + + if rrbs: # RRBS preprocessing + rrbs_build(fasta_file, build_command, ref_path, options.low_bound, options.up_bound, options.aligner, options.cut_format) + else: # Whole genome preprocessing + wg_build(fasta_file, build_command, ref_path, options.aligner) + |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_seeker2-call_methylation.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/bs_seeker2-call_methylation.py Fri Jul 12 18:47:28 2013 -0400 |
[ |
b'@@ -0,0 +1,205 @@\n+#!/usr/bin/python\n+\n+from optparse import OptionParser, OptionGroup\n+from bs_utils.utils import *\n+\n+try :\n+ import pysam\n+except ImportError :\n+ print "[Error] Cannot import \\"pysam\\" package. Have you installed it?"\n+ exit(-1)\n+\n+import gzip\n+\n+def context_calling(seq, position):\n+\n+ word=seq[position]\n+ word=word.upper()\n+\n+ context="--"\n+ context_CH="--"\n+ if position + 2 < len(seq) and position - 2 >= 0:\n+\n+ if word == "C":\n+ word2 = seq[position+1]\n+ context_CH = word + word2\n+ if word2 == "G":\n+ context = "CG"\n+ elif word2 in [\'A\',\'C\',\'T\']:\n+ word3 = seq[position+2]\n+ if word3 == "G":\n+ context = "CHG"\n+ elif word3 in [\'A\',\'C\',\'T\']:\n+ context="CHH"\n+\n+ elif word == "G":\n+ word2 = seq[position-1]\n+ context_CH = word + word2\n+ context_CH = context_CH.translate(string.maketrans("ATCG", "TAGC"))\n+ if word2 == "C":\n+ context = "CG"\n+ elif word2 in [\'A\',\'G\',\'T\']:\n+ word3 = seq[position-2]\n+ if word3 == "C":\n+ context = "CHG"\n+ elif word3 in [\'A\',\'G\',\'T\']:\n+ context = "CHH"\n+\n+ return word, context, context_CH\n+\n+\n+\n+if __name__ == \'__main__\':\n+\n+ parser = OptionParser()\n+ parser.add_option("-i", "--input", type="string", dest="infilename",help="BAM output from bs_seeker2-align.py", metavar="INFILE")\n+ parser.add_option("-d", "--db", type="string", dest="dbpath",help="Path to the reference genome library (generated in preprocessing genome) [Default: %default]" , metavar="DBPATH", default = reference_genome_path)\n+ parser.add_option("-o", "--output-prefix", type="string", dest="output_prefix",help="The output prefix to create ATCGmap and wiggle files [INFILE]", metavar="OUTFILE")\n+\n+ parser.add_option("--wig", type="string", dest="wig_file",help="The output .wig file [INFILE.wig]", metavar="OUTFILE")\n+ parser.add_option("--CGmap", type="string", dest="CGmap_file",help="The output .CGmap file [INFILE.CGmap]", metavar="OUTFILE")\n+ 
parser.add_option("--ATCGmap", type="string", dest="ATCGmap_file",help="The output .ATCGmap file [INFILE.ATCGmap]", metavar="OUTFILE")\n+\n+ parser.add_option("-x", "--rm-SX", action="store_true", dest="RM_SX",help="Removed reads with tag \\\'XS:i:1\\\', which would be considered as not fully converted by bisulfite treatment [Default: %default]", default = False)\n+ parser.add_option("--txt", action="store_true", dest="text",help="Show CGmap and ATCGmap in .gz [Default: %default]", default = False)\n+\n+ parser.add_option("-r", "--read-no",type = "int", dest="read_no",help="The least number of reads covering one site to be shown in wig file [Default: %default]", default = 1)\n+ parser.add_option("-v", "--version", action="store_true", dest="version",help="show version of BS-Seeker2", metavar="version", default = False)\n+\n+ (options, args) = parser.parse_args()\n+\n+\n+ # if no options were given by the user, print help and exit\n+ if len(sys.argv) == 1:\n+ print parser.print_help()\n+ exit(0)\n+\n+ if options.version :\n+ show_version()\n+ exit (-1)\n+ else :\n+ show_version()\n+\n+\n+ if options.infilename is None:\n+ error(\'-i option is required\')\n+ if not os.path.isfile(options.infilename):\n+ error(\'Cannot find input file: %s\' % options.infilename)\n+\n+ open_log(options.infilename+\'.call_methylation_log\')\n+ db_d = lambda fname: os.path.join( os.path.expanduser(options.dbpath), fname) # bug fixed, weilong\n+\n+ logm(\'sorting BS-Seeker alignments\')\n+ sorted_input_filename = options.infilename+\'_sorted\'\n+ pysam.sort(options.infilename, sorted_input_filename)\n+ sorted_input_filename += \'.bam\'\n+ logm(\'indexing sorted alignments\')\n+ pysam.index(sorted_input_fi'..b'tions.infilename) + \'.ATCGmap.gz\')\n+ ATCGmap = gzip.open(ATCGmap_fname, \'wb\')\n+\n+ CGmap_fname = options.CGmap_file or ((options.output_prefix or options.infilename) + \'.CGmap.gz\')\n+ CGmap = gzip.open(CGmap_fname, \'wb\')\n+\n+ wiggle_fname = options.wig_file or 
((options.output_prefix or options.infilename) + \'.wig\')\n+ wiggle = open(wiggle_fname, \'w\')\n+\n+ sorted_input = pysam.Samfile(sorted_input_filename, \'rb\')\n+ \n+ chrom = None\n+ nucs = [\'A\', \'T\', \'C\', \'G\', \'N\']\n+ ATCG_fwd = dict((n, 0) for n in nucs)\n+ ATCG_rev = dict((n, 0) for n in nucs)\n+ for col in sorted_input.pileup():\n+ col_chrom = sorted_input.getrname(col.tid)\n+ if chrom != col_chrom:\n+ chrom = col_chrom\n+ chrom_seq = deserialize(db_d(chrom))\n+ wiggle.write(\'variableStep chrom=%s\\n\' % chrom)\n+\n+ for n in nucs:\n+ ATCG_fwd[n] = 0\n+ ATCG_rev[n] = 0\n+\n+ nuc, context, subcontext = context_calling(chrom_seq, col.pos)\n+ total_reads = 0\n+\n+\n+\n+ for pr in col.pileups:\n+ # print pr\n+ if (not pr.indel) : # skip indels\n+ #if ( (options.RM_SX) and (pr.alignment.tags[1][1] == 1) ):\n+ ##=== Fixed error reported by Roberto\n+ #print options.RM_SX, dict(pr.alignment.tags)["XS"]\n+ #if ( (options.RM_SX) and (dict(pr.alignment.tags)["XS"] == 1) ):\n+\n+ if ( (options.RM_SX) and (dict(pr.alignment.tags).get("XS",0) == 1) ):\n+ # print "Debug: ", options.RM_SX, pr.alignment.tags[1]\n+ # when need to filter and read with tag (XS==1), then remove the reads\n+ continue\n+\n+ if pr.qpos >= len(pr.alignment.seq):\n+ print \'WARNING: read %s has an invalid alignment. Discarding.. 
\' % pr.alignment.qname\n+ continue\n+ read_nuc = pr.alignment.seq[pr.qpos]\n+ # print "read_nuc=", read_nuc\n+ if pr.alignment.is_reverse:\n+ ATCG_rev[read_nuc] += 1\n+ else:\n+ ATCG_fwd[read_nuc] += 1\n+\n+ if read_nuc != \'N\':\n+ total_reads += 1\n+\n+ cnts = lambda d: \'\\t\'.join(str(d[n]) for n in nucs)\n+ fwd_counts = cnts(ATCG_fwd)\n+ rev_counts = cnts(ATCG_rev)\n+\n+ meth_level = None\n+ meth_cytosines = 0\n+ unmeth_cytosines = 0\n+\n+ if nuc == \'C\':\n+ # plus strand: take the ratio of C\'s to T\'s from reads that come from the forward strand\n+ meth_cytosines = ATCG_fwd[\'C\']\n+ unmeth_cytosines = ATCG_fwd[\'T\']\n+\n+ elif nuc == \'G\':\n+ # minus strand: take the ratio of G\'s to A\'s from reads that come from the reverse strand\n+ meth_cytosines = ATCG_rev[\'G\']\n+ unmeth_cytosines = ATCG_rev[\'A\']\n+\n+ if meth_cytosines + unmeth_cytosines > 0:\n+ meth_level = float(meth_cytosines)/(meth_cytosines + unmeth_cytosines)\n+\n+ pos = col.pos + 1\n+\n+ meth_level_string = str(meth_level) if meth_level is not None else \'na\'\n+ ATCGmap.write(\'%(chrom)s\\t%(nuc)s\\t%(pos)d\\t%(context)s\\t%(subcontext)s\\t%(fwd_counts)s\\t%(rev_counts)s\\t%(meth_level_string)s\\n\' % locals())\n+#\n+ all_cytosines = meth_cytosines + unmeth_cytosines \n+ if (meth_level is not None) and (all_cytosines >= options.read_no):\n+ # print all_cytosines\n+ if nuc == \'C\':\n+ wiggle.write(\'%d\\t%f\\n\' % (pos, meth_level))\n+ else :\n+ wiggle.write(\'%d\\t-%f\\n\' % (pos, meth_level))\n+ CGmap.write(\'%(chrom)s\\t%(nuc)s\\t%(pos)d\\t%(context)s\\t%(subcontext)s\\t%(meth_level_string)s\\t%(meth_cytosines)s\\t%(all_cytosines)s\\n\' % locals())\n+ ATCGmap.close()\n+ CGmap.close()\n+ wiggle.close()\n+\n+ logm(\'Wiggle: %s\'% wiggle_fname)\n+ logm(\'ATCGMap: %s\' % ATCGmap_fname)\n+ logm(\'CGmap: %s\' % CGmap_fname)\n+\n' |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_utils/.___init__.py |
b |
Binary file BSseeker2/bs_utils/.___init__.py has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_utils/._utils.py |
b |
Binary file BSseeker2/bs_utils/._utils.py has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_utils/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/bs_utils/__init__.py Fri Jul 12 18:47:28 2013 -0400 |
b |
@@ -0,0 +1,1 @@ +__author__ = 'pf' |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/bs_utils/utils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/bs_utils/utils.py Fri Jul 12 18:47:28 2013 -0400 |
[ |
b'@@ -0,0 +1,332 @@\n+import gzip\n+import os\n+\n+import datetime\n+import re\n+import shlex\n+import shutil\n+import string\n+import subprocess\n+import types\n+#from itertools import izip\n+\n+import marshal\n+import sys\n+\n+\n+\n+_rc_trans = string.maketrans(\'ACGT\', \'TGCA\')\n+def reverse_compl_seq(strseq):\n+ return strseq.translate(_rc_trans)[::-1]\n+\n+\n+\n+def show_version() :\n+ print ""\n+ print " BS-Seeker2 v2.0.3 - May 19, 2013 "\n+ print ""\n+\n+\n+\n+"""\n+IUPAC nucleotide code Base\n+ A\tAdenine\n+ C\tCytosine\n+ G\tGuanine\n+ T (or U)\tThymine (or Uracil)\n+ R\tA or G\n+ Y\tC or T\n+ S\tG or C\n+ W\tA or T\n+ K\tG or T\n+ M\tA or C\n+ B\tC or G or T\n+ D\tA or G or T\n+ H\tA or C or T\n+ V\tA or C or G\n+ N\tany base\n+ . or -\tgap\n+"""\n+\n+\n+def IUPAC ( nuc ) :\n+ if nuc == \'R\' :\n+ return (\'A\',\'G\')\n+ elif nuc == \'Y\' :\n+ return (\'C\', \'T\')\n+ elif nuc == \'S\' :\n+ return (\'G\', \'C\')\n+ elif nuc == \'W\' :\n+ return (\'A\', \'T\')\n+ elif nuc == \'K\' :\n+ return (\'G\',\'T\')\n+ elif nuc == \'M\' :\n+ return (\'A\',\'C\')\n+ elif nuc == \'B\' :\n+ return (\'C\', \'G\', \'T\')\n+ elif nuc == \'D\' :\n+ return (\'A\', \'G\', \'T\')\n+ elif nuc == \'H\' :\n+ return (\'A\', \'C\', \'T\')\n+ elif nuc == \'V\' :\n+ return (\'A\', \'C\', \'G\')\n+ elif nuc == \'N\' :\n+ return (\'A\', \'C\', \'G\', \'T\')\n+ else :\n+ return (nuc)\n+\n+\n+def uniq(inlist):\n+ # order preserving\n+ uniques = []\n+ for item in inlist:\n+ if item not in uniques:\n+ uniques.append(item)\n+ return uniques\n+\n+from itertools import product\n+\n+def EnumerateIUPAC ( context_lst ) :\n+ tag_list = []\n+# context_lst = [context]\n+ for one_context in context_lst :\n+ for m in product(*[ IUPAC(i) for i in list(one_context)]) :\n+ tag_list.append(\'\'.join(m))\n+ return uniq(tag_list)\n+\n+from itertools import product\n+\n+# example: cut3_context="CGG"\n+# return generator for : ["CGG","TGG"]\n+# wild-card C to both C and T\n+def Enumerate_C_to_CT ( 
cut3_context_lst ) :\n+ tag_list = []\n+ for context in cut3_context_lst :\n+ for m in product(*[i if (i is not \'C\') else (\'C\',\'T\') for i in context]) :\n+ tag_list.append(\'\'.join(m))\n+ return uniq(tag_list)\n+\n+#-------------------------------------------------------------------------------------\n+\n+# set a reasonable defaults\n+def find_location(program):\n+ def is_exe(fpath):\n+ return os.path.exists(fpath) and os.access(fpath, os.X_OK)\n+ for path in os.environ["PATH"].split(os.pathsep):\n+ if is_exe(os.path.join(path, program)):\n+ return path\n+\n+ return None\n+\n+BOWTIE = \'bowtie\'\n+BOWTIE2 = \'bowtie2\'\n+SOAP = \'soap\'\n+RMAP = \'rmap\'\n+\n+supported_aligners = [\n+ BOWTIE,\n+ BOWTIE2,\n+ SOAP,\n+ RMAP\n+ ]\n+\n+aligner_options_prefixes = { BOWTIE : \'--bt-\',\n+ BOWTIE2 : \'--bt2-\',\n+ SOAP : \'--soap-\',\n+ RMAP : \'--rmap-\' }\n+\n+aligner_path = dict((aligner, os.path.expanduser(find_location(aligner) or default_path))\n+ for aligner, default_path in\n+ [(BOWTIE,\'~/bowtie-0.12.7/\'),\n+ (BOWTIE2, \'~/bowtie-0.12.7/\'),\n+ (SOAP, \'~/soap2.21release/\'),\n+ (RMAP, \'~/rmap_v2.05/bin\')\n+ ])\n+\n+\n+reference_genome_path = os.path.join(os.path.split(globals()[\'__file__\'])[0],\'reference_genomes\')\n+\n+\n+\n+def error(msg):\n+ print >> sys.stderr, \'ERROR: %s\' % msg\n+ exit(1)\n+\n+\n+global_stime = datetime.datetime.now()\n+def elapsed(msg = None):\n+ print "[%s]" % msg if msg is not None else "+", "Last:" , datetime.datetime.now() - elapsed.stime, \'\\tTotal:\', datetime.datetime.now() - global_stime\n+\n+ elapsed.stime = datetime.datetime.now()\n+\n+elapsed.stime = datetime.dateti'..b'pe(fname) in [list, types.GeneratorType]:\n+ delete_files(*list(fname))\n+ elif os.path.isdir(fname):\n+ shutil.rmtree(fname)\n+ else:\n+ os.remove(fname)\n+\n+def split_file(filename, output_prefix, nlines):\n+ """ Splits a file (equivalend to UNIX split -l ) """\n+ fno = 0\n+ lno = 0\n+ INPUT = open(filename, \'r\')\n+ output = None\n+ for l in 
INPUT:\n+ if lno == 0:\n+ fno += 1\n+ if output is not None: output.close()\n+ output = open(\'%s%d\' % (output_prefix, fno), \'w\')\n+ lno = nlines\n+ output.write(l)\n+ lno -= 1\n+ output.close()\n+ INPUT.close()\n+\n+def isplit_file(filename, output_prefix, nlines):\n+ """ Splits a file (equivalend to UNIX split -l ) """\n+ fno = 0\n+ lno = 0\n+ try :\n+ input = (gzip.open if filename.endswith(\'.gz\') else open)(filename, \'r\')\n+ except IOError :\n+ print "[Error] Cannot find file : %s !" % filename\n+ exit(-1)\n+ output = None\n+ output_fname = None\n+ for l in input:\n+ if lno == 0:\n+\n+ if output is not None:\n+ output.close()\n+ yield output_fname\n+\n+ fno += 1\n+ output_fname = \'%s%d\' % (output_prefix, fno)\n+# output_fname = \'%s_0\' % output_prefix\n+ output = open(output_fname, \'w\')\n+ lno = nlines\n+ output.write(l)\n+ lno -= 1\n+ if output is not None:\n+ output.close()\n+ yield output_fname\n+\n+ input.close()\n+\n+def read_fasta(fasta_file):\n+ """ Iterates over all sequences in a fasta file. One at a time, without reading the whole file into the main memory.\n+ """\n+\n+ try :\n+ input = (gzip.open if fasta_file.endswith(\'.gz\') else open)(fasta_file)\n+ except IOError:\n+ print "[Error] Cannot find fasta file : %s !" % fasta_file\n+ exit(-1)\n+ sanitize = re.compile(r\'[^ACTGN]\')\n+ sanitize_seq_id = re.compile(r\'[^A-Za-z0-9]\')\n+\n+ chrom_seq = []\n+ chrom_id = None\n+ seen_ids = set()\n+\n+ for line in input:\n+ if line[0] == \'>\':\n+ if chrom_id is not None:\n+ yield chrom_id, \'\'.join(chrom_seq)\n+\n+ chrom_id = sanitize_seq_id.sub(\'_\', line.split()[0][1:])\n+\n+ if chrom_id in seen_ids:\n+ error(\'BS Seeker found identical sequence ids (id: %s) in the fasta file: %s. 
Please, make sure that all sequence ids are unique and contain only alphanumeric characters: A-Za-z0-9_\' % (chrom_id, fasta_file))\n+ seen_ids.add(chrom_id)\n+\n+ chrom_seq = []\n+\n+ else:\n+ chrom_seq.append(sanitize.sub(\'N\', line.strip().upper()))\n+\n+ yield chrom_id, \'\'.join(chrom_seq)\n+\n+ input.close()\n+\n+\n+def serialize(obj, filename):\n+ """ Be careful what you serialize and deseriazlize! marshal.load is not secure!\n+ """\n+ output = open(filename+\'.data\', \'w\')\n+ marshal.dump(obj, output)\n+ output.close()\n+\n+def deserialize(filename):\n+ """ Be careful what you serialize and deseriazlize! marshal.load is not secure!\n+ """\n+ try:\n+ input = open(filename+\'.data\')\n+ except IOError:\n+ print "\\n[Error]:\\n\\t Cannot find file: %s.data" % filename\n+ exit(-1)\n+\n+ obj = marshal.load(input)\n+ input.close()\n+ return obj \n+\n+\n+\n+def run_in_parallel(commands):\n+\n+ commands = [(cmd[0], open(cmd[1], \'w\')) if type(cmd) is tuple else (cmd, None) for cmd in commands]\n+\n+ logm(\'Starting commands:\\n\' + \'\\n\'.join([cmd for cmd, stdout in commands]))\n+ for i, proc in enumerate([subprocess.Popen(args = shlex.split(cmd), stdout = stdout) for cmd, stdout in commands]):\n+ return_code = proc.wait()\n+ logm(\'Finished: \' + commands[i][0])\n+ if return_code != 0:\n+ error(\'%s \\nexit with an error code: %d. Please, check the log files.\' % (commands[i], return_code))\n+ for _, stdout in commands:\n+ if stdout is not None:\n+ stdout.close()\n' |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/galaxy/.___init__.py |
b |
Binary file BSseeker2/galaxy/.___init__.py has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/galaxy/._bs_seeker2_wrapper.py |
b |
Binary file BSseeker2/galaxy/._bs_seeker2_wrapper.py has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/galaxy/._bs_seeker2_wrapper.xml |
b |
Binary file BSseeker2/galaxy/._bs_seeker2_wrapper.xml has changed |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/galaxy/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/galaxy/__init__.py Fri Jul 12 18:47:28 2013 -0400 |
b |
@@ -0,0 +1,1 @@ +__author__ = 'pf' |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/galaxy/bs_seeker2_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/galaxy/bs_seeker2_wrapper.py Fri Jul 12 18:47:28 2013 -0400 |
[ |
@@ -0,0 +1,123 @@ +import tempfile + +__author__ = 'pf' +from subprocess import Popen +from collections import defaultdict +import sys, shutil, os, re + + + +BUILD = 'build' +ALIGN = 'align' +CALL_METHYLATION = 'call_methylation' + +EXEC = 'exec' +EXEC_PATH = EXEC+'-path' +ARG_TYPES = [BUILD, ALIGN, CALL_METHYLATION, EXEC] + +USAGE = """ +%(script)s is a wrapper script for bs_seeker2-build.py and bs_seeker2-align.py that is intended to be used with the Galaxy web platform. +The script takes command line parameters and runs bs_seeker2-align.py and bs_seeker2-build.py, if neccessary. + +The parameters that are related to bs_seeker2-build.py must be prefixed with --%(build_tag)s. +The parameters that are related to bs_seeker2-align.py must be prefixed with --%(align_tag)s. +Additionally, the path to BS-Seeker2 has to be specified via the --%(exec_path)s option. + +For example: + + python %(script)s --%(exec_path)s /mnt/Data/UCLA/Matteo/BS-Seeker --build-f data/arabidopsis/genome/Arabidopsis.fa --align-i data/arabidopsis/BS6_N1try2L7_seq.txt.fa --align-o data/arabidopsis/BS6_N1try2L7_seq.txt.fa.test_output + +This will run build the genome in Arabidopsis.fa and put the indexes in a temporary directory. bs_seeker2-align.py will be run on the +newly created genome index. I.e. the following two commands will be run in a shell: + + python /mnt/Data/UCLA/Matteo/BS-Seeker/bs_seeker2-build.py --db /tmp/tmpg8Eq1o -f /mnt/Data/UCLA/Matteo/bck_BS-Seeker/data/arabidopsis/genome/Arabidopsis.fa + + python /mnt/Data/UCLA/Matteo/BS-Seeker/bs_seeker2-align.py --db /tmp/tmpg8Eq1o -o /mnt/Data/UCLA/Matteo/bck_BS-Seeker/data/arabidopsis/BS6_N1try2L7_seq.txt.fa.test_output -i /mnt/Data/UCLA/Matteo/bck_BS-Seeker/data/arabidopsis/BS6_N1try2L7_seq.txt.fa -g Arabidopsis.fa + + +The temporary directory will be deleted after the wrapper exits. 
+ + +If no options related to bs_seeker2-build are passed, no genome index will be built and the corresponding pre-built genome index will be used +instead. No temporary files and directories will be created. + +""" % { 'script' : os.path.split(__file__)[1], 'build_tag' :BUILD, 'align_tag' : ALIGN, 'exec_path' : EXEC_PATH } + + +def error(msg): + print >> sys.stderr, 'ERROR: %s' % msg + exit(1) + + +if __name__ == '__main__': + if len(sys.argv) == 1: + error('No parameters\n\n'+USAGE) + + + # Parse command line arguments + args = defaultdict(dict) + arg_key = None + arg_val = None + arg_type = None + + for arg in sys.argv[1:]: + if arg.startswith('--'): + try: + arg_type, arg_key = re.match(r'--(\w+)(.*)', arg).groups() + if arg_type not in ARG_TYPES: + raise Exception("Bad argument: %s. arg_type (%s) must be one of: %s." % (arg, arg_type, ', '.join(ARG_TYPES))) + if not arg_key or arg_key[0] != '-': + raise Exception("Bad argument: %s. arg_key (%s) must start with - or --." % (arg, arg_key)) + except Exception, e: + error(str(e) + '\n\n' + USAGE) + args[arg_type][arg_key] = '' + else: + args[arg_type][arg_key] = arg + + path_to_bs_seeker = args.get('exec', {'-path' : None})['-path'] # return None when exec not found + if path_to_bs_seeker is None: + error('You have to specify the path to BS-Seeker2 via --%s\n\n' % EXEC_PATH + USAGE) + + tempdir = None + def run_prog(prog, params): + cwd, _ = os.path.split(__file__) + cmd = 'python %(prog)s %(params)s' % { + 'prog' : os.path.join(cwd, prog), + 'params' : ' '.join('%s %s' % (arg_key, arg_val) for arg_key, arg_val in params.items()) + } + print 'exec:', cmd + + return_code = Popen(args = cmd, shell = True).wait() + if return_code: + if tempdir: + shutil.rmtree(tempdir) + error("%s exit with error code %d" % (prog, return_code)) + tempdir = tempfile.mkdtemp() + + # bs_seeker2-build + if BUILD in args: + args[BUILD]['--db'] = tempdir + args[ALIGN]['--db'] = tempdir + run_prog(os.path.join(path_to_bs_seeker, 
'bs_seeker2-build.py'), args[BUILD]) + + # bs_seeker2-align + args[ALIGN]['--temp_dir'] = tempdir + run_prog(os.path.join(path_to_bs_seeker, 'bs_seeker2-align.py'), args[ALIGN]) + + + def getopt(h, k1, k2, default): + return h.get(k1, h.get(k2, default)) + # bs_seeker2-call_methylation + args[CALL_METHYLATION].update({ '-i' : args[ALIGN]['--output'], + '--db' : os.path.join(args[ALIGN]['--db'], + os.path.split( getopt(args[ALIGN],'-g', '--genome', None))[1] + + ('_rrbs_%s_%s' % (getopt(args[ALIGN], '-l', '--low', '40'), + getopt(args[ALIGN], '-u', '--up', '500')) + if len(set(['-r', '--rrbs']) & set(args[ALIGN])) > 0 else '') + + + '_' + args[ALIGN]['--aligner']) + }) + run_prog(os.path.join(path_to_bs_seeker, 'bs_seeker2-call_methylation.py'), args[CALL_METHYLATION]) + + if tempdir: + shutil.rmtree(tempdir) \ No newline at end of file |
b |
diff -r 000000000000 -r e6df770c0e58 BSseeker2/galaxy/bs_seeker2_wrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/galaxy/bs_seeker2_wrapper.xml Fri Jul 12 18:47:28 2013 -0400 |
[ |
b'@@ -0,0 +1,255 @@\n+<tool id="bs_seeker_wrapper" name="BS-Seeker2" version="2.0.0">\n+ <requirements><requirement type=\'package\'>bs_seeker2</requirement></requirements>\n+ <description>Versatile aligner for bisulfite sequencing data</description>\n+ <command interpreter="python">\n+ bs_seeker2_wrapper.py\n+ ### define exec path\n+ ### --exec-path "/u/home/galaxy/galaxy/GalaxyTools/bin"\n+ ### [Please change the following path to your local directory]\n+ --exec-path "/Users/weilong/Documents/program/BSseeker2"\n+ ### output\n+ --align--output $align_output\n+ --call_methylation--wig $call_methylation_wig\n+ --call_methylation--CGmap $call_methylation_CGmap\n+ --call_methylation--ATCGmap $call_methylation_ATCGmap\n+ --call_methylation--txt\n+\n+ #if $singlePaired.sPaired == "paired"\n+ --align--input_1 $input1\n+ --align--input_2 $singlePaired.input2\n+ #end if\n+\n+\n+ ### aligner\n+ --align--aligner ${choosealigner.aligner}\n+\n+ ### Index from history or built-in\n+ #if $choosealigner.rrbsFragments.refGenomeSource.genomeSource == "history"\n+ --build--file ${choosealigner.rrbsFragments.refGenomeSource.ownFile}\n+ --build--aligner ${choosealigner.aligner}\n+ --align-g ${choosealigner.rrbsFragments.refGenomeSource.ownFile}\n+ --align--db ${choosealigner.rrbsFragments.refGenomeSource.ownFile}\n+ #else if $choosealigner.rrbsFragments.refGenomeSource.genomeSource == "indexed"\n+ --align--db ${choosealigner.rrbsFragments.refGenomeSource.index.fields.path}\n+ --align-g ${choosealigner.rrbsFragments.refGenomeSource.index.fields.path}/${choosealigner.rrbsFragments.refGenomeSource.index.fields.dbkey}.fa\n+\n+ #end if\n+\n+ ### RRBS or WGBS\n+ #if $choosealigner.rrbsFragments.Fragmented == "Yes"\n+ #if $choosealigner.rrbsFragments.refGenomeSource.genomeSource == "history"\n+ --build--rrbs\n+ --build--low ${choosealigner.rrbsFragments.lowerBound}\n+ --build--up ${choosealigner.rrbsFragments.upperBound}\n+ #end if\n+ --align--rrbs\n+ --align--low 
${choosealigner.rrbsFragments.lowerBound}\n+ --align--up ${choosealigner.rrbsFragments.upperBound}\n+ #end if\n+\n+\n+\n+ ### Inputs\n+ #if $singlePaired.sPaired == "single"\n+ --align-i $input1\n+ #end if\n+\n+ ### Library type\n+ --align-t $tag\n+\n+ ### other general options\n+ #if $sParams.sSettingsType == "preSet"\n+ --align--start_base 1\n+ --align--end_base 200\n+ --align--mis 4\n+ #end if\n+\n+ ### adapter information\n+ #if $adapterInfo.useAdapter == "Yes"\n+ --align--adapter ${adapterInfo.adapter_file}\n+ #end if\n+\n+ #if $sParams.sSettingsType == "full"\n+ --align--start_base ${sParams.start_base}\n+ --align--end_base ${sParams.end_base}\n+ --align--mis ${sParams.num_mismatch}\n+ #end if\n+\n+ </command>\n+ <inputs>\n+ <param format="fastq,fasta,qseq" name="input1" type="data" label="Input your read file" help="reads file in fastq, qseq or fasta format" />\n+ <conditional name="singlePaired">\n+ <param name="sPaired" type="select" label="Is this library mate-paired?">\n+ <option value="single">Single-end</option>\n+ <option value="paired">Paired-end</option>\n+ </param>\n+ <when value="paired">\n+ <param format="fastq,fasta,qseq" name="input2" type="data" label="Input your read file 2" help="reads in fastq, qseq or fasta format" />\n+ <param name="min_ins_distance" type="integer" value="-1" label=" Minimum insert size for valid paired-end alignments" />\n+ <param name="max_ins_distance" type="integer" value="400" label="Maximum insert size for valid paired-end alignments" />\n+ </when>\n+ </conditional> \n+ <param name="tag" type="select" label="Type of librarie'..b'+ <when value="history">\n+ <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />\n+ </when>\n+ </conditional>\n+ </when>\n+ <when value="No">\n+ <conditional name="refGenomeSource">\n+ <param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" 
help="">\n+ <option value="indexed">Use a built-in index</option>\n+ <option value="history">Use one from the history</option>\n+ </param>\n+ <when value="indexed">\n+ <param name="index" type="select" label="Select a reference genome (WGBS, bowtie2)">\n+ <options from_data_table="bs_seeker2_indexes_WGBS_bowtie2">\n+ <filter type="sort_by" column="2"/>\n+ <validator type="no_options" message="No indexes are available for the selected input dataset"/>\n+ </options>\n+ </param>\n+ </when>\n+ <when value="history">\n+ <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />\n+ </when>\n+ </conditional>\n+ </when>\n+ </conditional>\n+ </when>\n+ </conditional>\n+ <conditional name="adapterInfo">\n+ <param name="useAdapter" type="select" label="adapter sequence">\n+ <option value="noAdapter">No</option>\n+ <option value="withAdapter">Yes</option>\n+ </param>\n+ <when value="withAdapter">\n+ <param format="txt" name="adapter_file" type="data" label="Input file of your adaptor sequences" help="Input text file of your adaptor sequences" />\n+ </when>\n+ </conditional>\n+\n+ <conditional name="sParams">\n+ <param name="sSettingsType" type="select" label="BS Seeker2 settings to use" help="You can use the default settings or set customer values for the BS Seeker2 parameters.">\n+ <option value="preSet">User Defaults</option>\n+ <option value="full">Full parameter list</option>\n+ </param>\n+ <when value="preSet" /> \n+ <when value="full">\n+ <param name="start_base" type="integer" value="1" label="The start base of the read to be mapped" help="" />\n+ <param name="end_base" type="integer" value="200" label="The end base of the read to be mapped" help="" />\n+\n+ <param name="num_mismatch" type="integer" value="4" label="Number of mismatches" help="(INT) Default: 4" />\n+ </when> \n+ </conditional>\n+\n+</inputs>\n+\n+ <outputs>\n+ <data format="bam" name="align_output" label="BAM Alignments"> </data>\n+ <data format="wig" 
name="call_methylation_wig" label="Methylation Levels"> </data>\n+ <data format="tabular" name="call_methylation_CGmap" label="CGmap file"> </data>\n+ <data format="tabular" name="call_methylation_ATCGmap" label="ATCGmap file"> </data>\n+\n+ </outputs>\n+ <help>\n+**What it does**\n+\n+BS-Seeker2 is a seamlessly pipeline for mapping bisulfite sequencing data and generating detailed DNA methylome. BS-Seeker2 improves mappability by using local alignment, and is tailored for RRBS library by building special index, with higher efficiency and accuracy. This is the Galaxy version of BS-Seeker2.\n+\n+------\n+\n+**Resources**\n+\n+The homepage for BS-Seeker2 is http://pellegrini.mcdb.ucla.edu/BS_Seeker2/.\n+\n+For more information of BS-Seeker2, please refer to https://github.com/BSSeeker/BSseeker2.\n+\n+------\n+\n+**Example**\n+\n+- Adapter file::\n+\n+ AGATCGGAAGAGCACACGTC\n+\n+\n+ </help>\n+</tool>\n' |