str_fm: microsatellite.py comparison

comparison microsatellite.py @ 0:07588b899c13 draft

Uploaded

author	arkarachai-fungtammasan
date	Wed, 01 Apr 2015 17:05:51 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:07588b899c13
+#!/usr/bin/env python
+"""
+Snoop thru a fasta file looking for microsatellite repeats of given periods
+Output format: length_of_repeat left_flank_length right_flank_length  repeat_motif  hamming_distance  read_name read_sequence read_quality  (additional columns)
+If --r option turned on, output format will have additional columns behind:
+read_name read_chr  pre_s pre_e tr_s  tr_e  suf_s suf_e tr_len  tr_ref_seq
+pre_s           where the read start
+pre_e           the last position before microsatellite
+tr_s            where microsatellite start
+tr_e            where microsatellite end
+suf_s           first base after microsatellite
+tr_ref_seq      reference sequence corresponding to microsatellite
+* output positions are 0 based
+:Author: Chen Sun (cxs1031@cse.psu.edu); Bob Harris (rsharris@bx.psu.edu)
+modifing log:
+09/27/2013
+replace function dense_intervals with function non_negative_intervals, which do not need to import such file.
+10/18/2013
+modify function find_repeat_element to get a quick speed, under the condition that hamming_distance = 0, which means do not allowed any mutation/indel
+02/25/2014
+add function that can deal with mapped reads
+with additional output
+02/28/2014
+modify the 0-based end point, as in 0-base area, it is half-open [ )
+so the 0-based site, should always be added by 1
+03/05/2014
+deal with multi-fasta
+"""
+from sys          import argv,stdin,stderr,exit
+from string       import maketrans
+from md5          import new as md5_new
+import re
+#from pyfracluster import dense_intervals
+def usage(s=None):
+message = """
+usage: microsat_snoop [fasta_file] [options]
+<fasta_file>                Name of file to read sequences from;  if absent,
+sequences are read from stdin
+--fasta                     Input file is in fasta format
+(this is the default)
+--fastq                     Input file is in fastq format
+(default is fasta unless filename is .fastq)
+--fastq:noquals             Input file is in fastq format, but discard quals
+--sam                       Input file is SAM file
+--r                         Indicate additional output information, if indicated,
+--ref option is mendatory
+--ref=<filepath>            Reference file (absolute) path
+--period=<length>           (mandatory,cumulative) repeat length(s) to be
+searched for
+<length> is expected to be small, less than 10
+<length> can also be a comma-separated list, or
+a range <low>..<high>
+--rate=<fraction>           control the candidate repeat interval detector;
+it will consider intervals with at least
+<fraction> of matches when shifted by the period;
+<fraction> is between 0 and 1 and can be either a
+real number or <n>/<d>
+(default is 6/7)
+--minlength=<length>        minimum length of intervals reported, in bp
+(default is 20)
+--progress=<count>          how often to report the sequence we're searching
+(default is no progress report)
+--allowduplicates           process all input sequences
+(this is the default)
+--noduplicates              ignore any input sequence that's the same as an
+earlier sequence
+--nonearduplicates          ignore any input sequence that has the same first
+100 bp as an earlier sequence
+--nonearduplicate=<length>  ignore any input sequence that has the same first
+<length> bp as an earlier sequence
+--hamming=<count>           Don't report candidate repeat intervals that have
+more than <count> mismatches
+(default is to do no such filtering)
+--prefix=<length>           Don't report candidate repeat intervals that
+start within <length> of the sequence start
+(default is to do no such filtering)
+--suffix=<length>           Don't report candidate repeat intervals that
+end within <length> of the sequence end
+(default is to do no such filtering)
+--subsample=<k>/<n>         Process only the <k>th sequence of every group of
+<n> sequences;  <k> ranges from 1 to <n>
+--multipleruns              Consider all candidate intervals in a sequence
+(default is to consider only the longest)
+--partialmotifs             Consider microatelites with a partial motif
+(default is to consider only whole motifs)
+--splitbyvalidity           Preprocess sequences, splitting at Ns;  this
+prevents candidates from including Ns
+(default is not to split)
+--noflankdisplay            Show entire sequence as flanking regions
+(this is the default)
+--flankdisplay=<length>     Limit length of flanking regions shown
+--readnamesuffix=<string>   Root of suffix to append to read names;  e.g. 1
+for forward, 2 for reverse;  this triggers other
+info to be included in the suffix
+(default is "1" for fastq;  no suffix for fasta)
+--head=<number>             limit the number of sequences processed
+--markend                   Write a marker line upon completion
+(default is not to write a marker)
+--help=details              Describe the process, and quit"""
+if (s == None): exit (message)
+else:           exit ("%s\n%s" % (s,message))
+detailedDescription = """In broad terms, the process works as follows:
+(1) Identify intervals that are highly correlated with the interval shifted by
+P (the repeat period).  These intervals are called "runs" or "candidates".
+The level of correlation required is controlled by rateThreshold.
+Depending on whether we want to look for more than one microsat, we either
+find the longest such run (simple algorithm) or many runs (more complicated
+algorithm). The following steps are then performed on each run.
+(2) Find the most likely repeat motif in the run.  This is done by counting
+all kmers (of length P) and choosing the most frequent.  If that kmer is
+itself covered by a sub-repeat we discard this run.  The idea is that we
+can ignore a 6-mer like ACGACG because we will find it when we are looking
+for 3-mers.
+(3) Once we identify the most likely repeat motif, we then modify the
+interval, adjusting start and end to find the interval that has the fewest
+mismatches vs. a sequence of the motif repeated (hamming distance).  Only
+whole copies of the motif are considered.
+(4) At this point we have a valid microsat interval (in the eyes of the
+program). It is subjected to some filtering stages (hamming distance or too
+close to an end), and if it satisfies those conditions, it's reported to
+the user."""
+def main():
+global debug
+#=== parse the command line ===
+inputFilename         = None
+referenceFileName     = None #add by Chen Sun on 02/25
+inputFormat           = None
+repeatPeriods         = []
+rateThreshold         = 6 / 7.0
+lengthThreshold       = 20
+reportProgress        = None
+discardDuplicates     = False
+discardNearDuplicates = False
+nearDuplicatePrefix   = 100
+hammingThreshold      = 0
+prefixThreshold       = None
+suffixThreshold       = None
+subsampleK            = None
+subsampleN            = None
+reportMultipleRuns    = False
+allowPartialMotifs    = False
+splitByValidity       = False
+flankDisplayLimit     = None
+readNameSuffix        = None
+headLimit             = None
+markEndOfFile         = False
+additionalInfo        = False
+debug                 = []
+for arg in argv[1:]:
+if (arg == "--fasta"):
+inputFormat = "fasta"
+elif (arg == "--fastq"):
+inputFormat = "fastq"
+elif (arg == "--fastq:noquals"):
+inputFormat = "fastq:noquals"
+elif (arg == "--sam"):
+inputFormat = "sam"
+elif (arg == "--r"):
+additionalInfo = True
+elif (arg.startswith("--ref=")):
+referenceFileName = arg.split("=",1)[1]
+elif (arg.startswith("--period=")):
+val = arg.split("=",1)[1]
+for period in val.split(","):
+if (".." in period):
+(lowPeriod,highPeriod) = period.split("..",1)
+lowPeriod  = int(lowPeriod)
+highPeriod = int(highPeriod)
+for period in xrange(lowPeriod,highPeriod+1):
+repeatPeriods += [period]
+else:
+repeatPeriods += [int(period)]
+elif (arg.startswith("--rate=")):
+val = arg.split("=",1)[1]
+rateThreshold = float_or_fraction(val)
+assert (0.0 < rateThreshold <= 1.0), "%s not a valid rate" % val
+elif (arg.startswith("--minlength=")):
+val = arg.split("=",1)[1]
+lengthThreshold = int(val)
+assert (lengthThreshold >= 0)
+elif (arg.startswith("--progress=")):
+val = arg.split("=",1)[1]
+reportProgress = int(val)
+elif (arg == "--allowduplicates"):
+discardDuplicates     = False
+discardNearDuplicates = False
+elif (arg == "--noduplicates"):
+discardDuplicates     = True
+discardNearDuplicates = False
+elif (arg == "--nonearduplicates"):
+discardDuplicates     = False
+discardNearDuplicates = True
+elif (arg.startswith("--nonearduplicate=")):
+val = arg.split("=",1)[1]
+discardDuplicates     = False
+discardNearDuplicates = True
+nearDuplicatePrefix   = int(val)
+assert (nearDuplicatePrefix > 0)
+elif (arg.startswith("--hamming=")):
+val = arg.split("=",1)[1]
+hammingThreshold = int(val)
+assert (hammingThreshold >= 0)
+elif (arg.startswith("--prefix=")):
+val = arg.split("=",1)[1]
+prefixThreshold = int(val)
+assert (prefixThreshold >= 0)
+elif (arg.startswith("--suffix=")):
+val = arg.split("=",1)[1]
+suffixThreshold = int(val)
+assert (suffixThreshold >= 0)
+elif (arg.startswith("--subsample=")):
+val = arg.split("=",1)[1]
+(k,n) = val.split("/",2)
+subsampleK = int(k)
+subsampleN = int(n)
+assert (0 < subsampleK <= subsampleN)
+elif (arg == "--multipleruns"):
+reportMultipleRuns = True
+elif (arg == "--partialmotifs"):
+allowPartialMotifs = True
+elif (arg == "--splitbyvalidity"):
+splitByValidity = True
+elif (arg == "--noflankdisplay"):
+flankDisplayLimit = None
+elif (arg.startswith("--flankdisplay=")):
+val = arg.split("=",1)[1]
+flankDisplayLimit = int(val)
+assert (flankDisplayLimit >= 0)
+elif (arg.startswith("--readnamesuffix")):
+readNameSuffix = arg.split("=",1)[1]
+elif (arg.startswith("--head=")):
+headLimit = int_with_unit(arg.split("=",1)[1])
+elif (arg == "--markend"):
+markEndOfFile = True
+elif (arg == "--help=details"):
+exit (detailedDescription)
+elif (arg.startswith("--debug=")):
+debug += (arg.split("=",1)[1]).split(",")
+elif (arg.startswith("--")):
+usage("unrecognized option: %s" % arg)
+elif (inputFilename == None):
+inputFilename = arg
+else:
+usage("unrecognized option: %s" % arg)
+#=== determine periods of interest ===
+if (repeatPeriods == []):
+usage("you gotta give me a repeat period")
+if (additionalInfo == True):
+if (referenceFileName == None):
+usage("reference file path needed. use --ref=<reference> to indicate")
+periodSeed = {}
+for period in repeatPeriods:
+if (period < 1): usage("period %d is not valid" % period)
+periodSeed[period] = True
+repeatPeriods = [period for period in periodSeed]
+repeatPeriods.sort()
+#=== determine input format ===
+if   (inputFormat == "fasta"):           sequence_reader = fasta_sequences
+elif (inputFormat == "fastq"):           sequence_reader = fastq_sequences
+elif (inputFormat == "fastq:noquals"):   sequence_reader = fastq_sequences
+elif (inputFormat == "sam"):             sequence_reader = sam_sequences
+elif (inputFilename == None):            sequence_reader = fasta_sequences
+elif (inputFilename.endswith(".fastq")): sequence_reader = fastq_sequences
+elif (inputFilename.endswith(".fq")):    sequence_reader = fastq_sequences
+elif (inputFilename.endswith(".sam")):   sequence_reader = sam_sequences
+else:                                    sequence_reader = fasta_sequences
+if (inputFilename != None): inputF = file(inputFilename,"rt")
+else:                       inputF = stdin
+if   (readNameSuffix == None) \
+and (sequence_reader == fastq_sequences) \
+and (inputFormat != "fastq:noquals"):
+readNameSuffix = "1"
+#=== process the sequences ===
+refSequence = {}
+rightName = ""
+sequence = ""
+if additionalInfo:
+firstFasta = True
+originalRefF = open(referenceFileName)
+for line in originalRefF.readlines():
+line = line.replace('\r','')
+line = line.replace('\n','')
+if line.startswith(">"):
+if firstFasta:
+firstFasta = False
+else:
+refSequence[rightName] = sequence
+rightName = line[1:]
+sequence = ""
+continue
+sequence += line
+originalRefF.close()
+refSequence[rightName] = sequence
+sequenceSeen = {}
+numSequences = 0
+for seqInfo in sequence_reader(inputF):
+numSequences += 1
+if (headLimit != None) and (numSequences > headLimit):
+print >>stderr, "limit of %d sequences reached" % headLimit
+break
+if (sequence_reader == sam_sequences):
+#seqName,"".join(seqNucs).upper().translate(nonDnaMap), refName, pre_s, cigar
+(name, sequence, refName, pre_s, cigar) = seqInfo
+quals = None
+elif (sequence_reader == fastq_sequences):
+(name,sequence,quals) = seqInfo
+if (inputFormat == "fastq:noquals"): quals = None
+else:
+(name,sequence) = seqInfo
+quals = None
+if (reportProgress != None) and (numSequences % reportProgress == 0):
+print >>stderr, "%s %d" % (name,numSequences)
+# if we're subsampling and not interested in this sequence, skip it
+if (subsampleN != None):
+if ((numSequences-1) % subsampleN != (subsampleK-1)):
+continue
+# if this sequence is shorter than the length of interest, skip it
+seqLen = len(sequence)
+if (seqLen < period) or (seqLen < lengthThreshold): continue
+# if we're not interested in duplicates and this is one, skip it;
+# note that we assume no hash collisions occur, i.e. that all hash
+# matches are truly sequence matches
+if (discardDuplicates):
+h = hash108(sequence)
+if (h in sequenceSeen): continue
+sequenceSeen[h] = True
+elif (discardNearDuplicates):
+h = hash108(sequence[:nearDuplicatePrefix])
+if (h in sequenceSeen): continue
+sequenceSeen[h] = True
+# split the sequence into chunks of valid nucleotides
+if (splitByValidity):
+chunks = [(start,end) for (start,end) in nucleotide_runs(sequence)]
+else:
+chunks = [(0,len(sequence))]
+# evaluate for each period of interest
+for period in repeatPeriods:
+# operate on each chunk
+for (chunkStart,chunkEnd) in chunks:
+chunkLen = chunkEnd - chunkStart
+if (chunkLen < period) or (chunkLen < lengthThreshold): continue
+if ("validity" in debug) or ("correlation" in debug) or ("runs" in debug):
+print >>stderr, ">%s_%d_%d" % (name,chunkStart,chunkEnd)
+# compute correlation sequence
+corr = correlation_sequence(sequence,period,chunkStart,chunkEnd)
+if ("correlation" in debug) or ("runs" in debug):
+print >>stderr, sequence[chunkStart:chunkEnd]
+print >>stderr, corr
+# find runs (candidates for being a microsat)
+if (reportMultipleRuns):
+runs = all_suitable_runs(corr,lengthThreshold-period,rateThreshold, hammingThreshold)
+else:
+runs = longest_suitable_run(corr,lengthThreshold,rateThreshold)
+if (runs == []): continue
+if ("runs" in debug):
+for (start,end) in runs:
+run = [" "] * seqLen
+for ix in xrange(start-period,end):
+run[ix] = "*"
+print >>stderr, "".join(run)
+if ("candidates" in debug):
+for (start,end) in runs:
+print >>stderr, "%s %d %d" % (name,start,end)
+# process runs and report those that pass muster
+runCount = 0
+for (start,end) in runs:
+runCount += 1
+start = chunkStart + start - period
+end   = chunkStart + end
+(kmer,d,start,end) = find_repeat_element(hammingThreshold, period,sequence,start,end,allowPartials=allowPartialMotifs)
+if (kmer == None): continue    # (no useful repeat kmer was found)
+rptExtent = end - start
+prefixLen = start
+suffixLen = seqLen - end
+if (rptExtent <= period): continue
+if (hammingThreshold != None) and (d         > hammingThreshold): continue
+if (prefixThreshold  != None) and (prefixLen < prefixThreshold):  continue
+if (suffixThreshold  != None) and (suffixLen < suffixThreshold):  continue
+if (flankDisplayLimit == None):
+seq = sequence[:start] \
++ sequence[start:end].lower() \
++ sequence[end:]
+else:
+seq = sequence[max(chunkStart,start-flankDisplayLimit):start] \
++ sequence[start:end].lower() \
++ sequence[end:min(chunkEnd,end+flankDisplayLimit)]
+reportName = name
+if (readNameSuffix != None):
+reportName += "_"+readNameSuffix+"_per"+str(period)+"_"+str(runCount)
+if (quals == None or quals == "." or quals == "\t."): quals = "\t."
+else:               quals = "\t" + quals
+if not additionalInfo:
+print "%d\t%d\t%d\t%s\t%d\t%s\t%s%s" \
+% (rptExtent,prefixLen,suffixLen,kmer,d,reportName,seq,quals)
+else:
+#pre_e = pre_s + prefixLen - 1
+refPoint = pre_s
+donorPoint = 0
+donorBeforeStart = prefixLen - 1 #pre_e
+donorMicroStart = prefixLen     #tr_s
+donorMicroEnd = donorMicroStart + rptExtent - 1 #tr_e
+donorAfterMicro = donorMicroEnd + 1 #suf_s
+donorEnd = len(seq) - 1    #suf_e
+set_pre_e = False
+set_tr_s = False
+set_tr_e = False
+set_suf_s = False
+set_suf_e = False
+pre_e = 0
+tr_s = 0
+tr_e = 0
+suf_s = 0
+suf_e = 0
+matchList = re.findall('(\d+)([IDM])', cigar)
+unCognitiveCigar = False
+for matchN, matchType in matchList:
+matchNum = int(matchN)
+if matchType == "M":
+donorPoint = donorPoint + matchNum
+refPoint = refPoint + matchNum
+elif matchType == "D":
+refPoint = refPoint + matchNum
+continue
+elif matchType == "I":
+donorPoint = donorPoint + matchNum
+else:
+unCognitiveCigar = True
+break
+if not set_pre_e:
+if donorPoint >= donorBeforeStart:
+pre_e = refPoint - (donorPoint - donorBeforeStart)
+set_pre_e = True
+else:
+continue
+if not set_tr_s:
+if donorPoint >= donorMicroStart:
+tr_s = refPoint - (donorPoint - donorMicroStart)
+set_tr_s = True
+else:
+continue
+if not set_tr_e:
+if donorPoint >= donorMicroEnd:
+tr_e = refPoint - (donorPoint - donorMicroEnd)
+set_tr_e = True
+else:
+continue
+if not set_suf_s:
+if donorPoint >= donorAfterMicro:
+suf_s = refPoint - (donorPoint - donorAfterMicro)
+set_suf_s = True
+else:
+continue
+if not set_suf_e:
+if donorPoint >= donorEnd:
+suf_e = refPoint - (donorPoint - donorEnd)
+set_suf_e = True
+else:
+continue
+if unCognitiveCigar:
+break
+tr_len = tr_e - tr_s + 1
+if refName not in refSequence:
+tr_ref_seq = "."
+else:
+if refSequence[refName] == "":
+tr_ref_seq = "."
+elif len(refSequence[refName]) <= tr_e:
+tr_ref_seq = "."
+else:
+tr_ref_seq = refSequence[refName][tr_s:tr_e+1]
+pre_e += 1
+tr_e += 1
+suf_e += 1
+print "%d\t%d\t%d\t%s\t%d\t%s\t%s%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s" \
+% (rptExtent,prefixLen,suffixLen,kmer,d,reportName,seq,quals,reportName,refName,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq)
+if (markEndOfFile):
+print "# microsat_snoop end-of-file"
+if (inputF != stdin):
+inputF.close()
+# non_negative_intervals
+#     find intervals with exactly + and no -
+#     from string like this : +++++++++---+++++++++
+def non_negative_intervals(seq, minLength=None):
+start = -1
+end = -1
+firstPlus = 1
+#print seq
+for ix in range(len(seq)): # for every char in seq
+ch = seq[ix]
+if(ch == "+"):
+if(firstPlus):
+firstPlus = 0
+start = ix
+else:
+continue
+elif(ch == "-"):
+if(start >= 0):
+end = ix-1
+if((end - start + 1) >= minLength):
+yield (start,end+1)
+start = -1
+firstPlus = 1
+if(start > 0):
+if((ix - start + 1) >= minLength):
+yield (start, ix+1)
+###################################################################
+# modified by Chen Sun on 7/11/2014
+# We do not want other modules, so parse these functions inside
+#
+###################################################################
+# parse a string of the form {positives}/{positives_and_neutrals}
+def parse_spec(s):
+if ("/" not in s): raise ValueError
+(n,d) = s.split("/",1)
+if (not n.startswith("{")) or (not n.endswith("}")): raise ValueError
+if (not d.startswith("{")) or (not d.endswith("}")): raise ValueError
+positives = n[1:-1]
+d         = d[1:-1]
+for ch in positives:
+if (ch not in d): raise ValueError
+neutrals = [ch for ch in d if (ch not in positives)]
+return (positives,neutrals)
+# convert a string to a number, allowing fractions
+def float_or_fraction(s):
+if ("/" in s):
+(numer,denom) = s.split("/",1)
+return float(numer)/float(denom)
+else:
+return float(s)
+# dense_intervals--
+#    Find all non-overlapping runs with a good enough rate (of positives), and
+#    which meet our length threshold.
+#
+#    The algorithm used is adapted from Zhang, Berman, Miller, "Post-processing
+#    long pairwise alignments", Bioinformatics Vol. 15 no. 12 1999.
+#
+# $$$ we use the denominator as the threshold, but we really should use the
+# $$$ .. numerator, comparing it to minLength*rate
+def dense_intervals(seq,rate,positives,neutrals,blockers="",minLength=None):
+if (blockers == None):
+blockers = "".join([chr(n) for n in range(1,256)
+if  (chr(n) not in positives)
+and (chr(n) not in neutrals)])
+stackLeft       = [None]    # stack with each entry containing five
+stackRight      = [None]    # .. elements;  note that entry zero is not
+stackLeftScore  = [None]    # .. used
+stackRightScore = [None]
+stackLower      = [None]
+top   = 0
+score = 0
+for ix in range(len(seq)):
+ch = seq[ix]
+if (ch in blockers):
+# emit intervals
+for sp in range(1,top+1):
+left  = stackLeft [sp] + 1
+right = stackRight[sp]
+while (left < right) and (seq[left]  not in positives): left  += 1
+while (right > left) and (seq[right] not in positives): right -= 1
+right += 1
+if (minLength == None) or (right - left >= minLength):
+yield (left,right)
+#empty stack
+stackLeft       = [None]
+stackRight      = [None]
+stackLeftScore  = [None]
+stackRightScore = [None]
+stackLower      = [None]
+top   = 0
+score = 0
+continue
+if   (ch in positives): weight = 1-rate
+elif (ch in neutrals):  weight = -rate
+else: raise ValueError
+score += weight
+#if ("algorithm" in debug):
+#    print >>sys.stderr, "%3d: %c %5.2f" % (ix, ch, score),
+if (weight < 0):
+#if ("algorithm" in debug):
+#    print >>sys.stderr
+continue
+if (top > 0) and (stackRight[top] == ix-1):
+# add this site to the interval on top of the stack
+stackRight     [top] = ix
+stackRightScore[top] = score
+#if ("algorithm" in debug):
+#    print >>sys.stderr, \
+#          " extending [%d] %d-%d %4.1f %4.1f" \
+#        % (top,
+#           stackLeft     [top], stackRight     [top],
+#           stackLeftScore[top], stackRightScore[top]),
+else:
+# create a one site interval
+top += 1
+if (top >= len(stackLeft)):
+stackLeft       += [None]
+stackRight      += [None]
+stackLeftScore  += [None]
+stackRightScore += [None]
+stackLower      += [None]
+stackLeft      [top] = ix - 1
+stackLeftScore [top] = score - weight
+stackRight     [top] = ix
+stackRightScore[top] = score
+stackLower     [top] = top - 1
+while (stackLower[top] > 0) \
+and (stackLeftScore[stackLower[top]] > stackLeftScore[top]):
+stackLower[top] = stackLower[stackLower[top]]
+#if ("algorithm" in debug):
+#    print >>sys.stderr, \
+#          " creating  [%d] %d-%d %4.1f %4.1f -> %d" \
+#        % (top,
+#           stackLeft     [top], stackRight     [top],
+#           stackLeftScore[top], stackRightScore[top],
+#           stackLower    [top]),
+# merge intervals;  if there is a previous interval with a no-higher
+# left score and no-higher right score, merge this interval (and all
+# intervening ones) into that one
+while (top > 1) \
+and (stackLower[top] > 0) \
+and (stackRightScore[stackLower[top]] <= stackRightScore[top]):
+stackRight     [stackLower[top]] = stackRight     [top]
+stackRightScore[stackLower[top]] = stackRightScore[top]
+top = stackLower[top]
+#if ("algorithm" in debug):
+#    print >>sys.stderr, \
+#          "\n%*s merging   [%d] %d-%d %4.1f %4.1f" \
+#        % (13, "", top,
+#           stackLeft[top],      stackRight     [top],
+#           stackLeftScore[top], stackRightScore[top]),
+#if ("algorithm" in debug):
+#    print >>sys.stderr
+# emit intervals
+for sp in range(1,top+1):
+left  = stackLeft [sp] + 1
+right = stackRight[sp]
+while (left < right) and (seq[left]  not in positives): left  += 1
+while (right > left) and (seq[right] not in positives): right -= 1
+right += 1
+if (minLength == None) or (right - left >= minLength):
+yield (left,right)
+###################################################################
+# modified by Chen Sun on 7/11/2014
+#
+###################################################################
+# correlation_sequence--
+#    Compute the correlation sequence for a given period.  This is a sequence
+#    of + and - indicating whether the base at a given position matches the one
+#    P positions earlier (where P is the period).  The first P positions are
+#    blank.  Positions with single character runs longer than the period are
+#    considered as non-matches, unless the period is 1.
+def correlation_sequence(sequence,period,start=None,end=None):
+if (start == None): start = 0
+if (end   == None): end   = len(sequence)
+prevCh = sequence[start]
+run    = 1
+for ix in xrange(start+1,start+period):
+ch = sequence[ix]
+if (ch != prevCh): run =  1
+else:              run += 1
+prevCh = ch
+corr = [" "] * period
+for ix in xrange(start+period,end):
+rptCh = sequence[ix-period]
+ch    = sequence[ix]
+if (ch != prevCh): run =  1
+else:              run += 1
+if    (ch    in "ACGT") \
+and (ch == rptCh) \
+and ((period == 1) or (run < period)):
+corr += ["+"]
+else:
+corr += ["-"]
+prevCh = ch
+return "".join(corr)
+# longest_suitable_run--
+#    Find longest run with a good enough rate (of positives).
+#
+#    We score a "+" as 1-r and anything else as -r.  This is based on the fol-
+#    lowing derivation (p is the number of "+"s, n is the number of non-"+"s):
+#        p/(p+n) >= r
+#        ==> p >= rp + rn
+#        ==> (1-r)p - rn >= 0
+#
+#    We adapt an algorithm from "Programming Pearls", pg. 81 (2000 printing).
+#
+# $$$ we use the denominator as the threshold, but we really should use the
+# $$$ .. numerator, comparing it to minLength*rate
+#
+# $$$ this needs to account for $$$ this situation:
+# $$$   sequence: ACGACGACGACGTTATTATTATTA
+# $$$   matches:     +++++++++---+++++++++
+# $$$ this is currently considered to be one interval (if rate <= 6/7), but it
+# $$$ ought to be two;  we can't just post-process, though, because some other
+# $$$ interval might be longer than the longest half of this;  maybe what we
+# $$$ need to do is consider matches at distances -P and -2P, or if we match
+# $$$ -P but that itself was a mismatch, we should carry the mismatch forward
+def longest_suitable_run(seq,minLength,rate):
+maxEndingHere = 0
+maxSoFar      = 0
+start         = None
+for ix in xrange(len(seq)):
+if (seq[ix] == "+"): s = 1-rate
+else:                s = -rate
+if (maxEndingHere+s < 0):
+maxEndingHere = 0
+block         = ix
+else:
+maxEndingHere += s
+if (maxEndingHere >= maxSoFar):
+maxSoFar = maxEndingHere
+start    = block + 1
+end      = ix + 1
+if (start == None) or (end - start < minLength):
+return []
+else:
+return [(start,end)]
+# all_suitable_runs--
+#    Find all non-overlapping runs with a good enough rate (of positives), and
+#    which meet our length threshold.
+# $$$ this needs to post-process the intervals, splitting them to account for
+# $$$ this situation:
+# $$$   sequence: ACGACGACGACGTTATTATTATTA
+# $$$   matches:     +++++++++---+++++++++
+# $$$ this is currently reported as one interval (if rate <= 6/7), but it
+# $$$ ought to be two
+def all_suitable_runs(seq,minCorrLength,rate, hammingThreshold):
+################################################################
+# modified by Chen Sun on 07/11/2014
+#
+################################################################
+if hammingThreshold > 0:
+return [(start,end) for (start,end) in dense_intervals(seq,rate,"+","-",blockers=None,minLength=minCorrLength)]
+elif hammingThreshold == 0:
+return [(start,end) for (start,end) in non_negative_intervals(seq, minLength=minCorrLength)]
+# find_repeat_element--
+#    Find the most plausible repeat element for a run, and nudge the ends of
+#    the run if needed.  Note that we will not consider kmers that represent
+#    shorter repeats.  For example, we won't report ACTACT as a 6-mer since we
+#    consider this to have a shorter period than 6.
+def find_repeat_element(hammingThreshold, period,seq,start,end,allowPartials=False):
+if hammingThreshold > 0:
+(kmer,bestD,bestStart,bestEnd) = find_hamming_repeat_element(period,seq,start,end,allowPartials)
+return (kmer,bestD,bestStart,bestEnd)
+# count the number of occurences of each k-mer;  note that we can't
+# reject kmers containing smaller repeats yet, since for a sequence like
+# ACACACACACAAACACACACACACACACAC we must first discover ACACAC as the best
+# 6-mer, and THEN reject it;  if we reject ACACAC while counting, we'd end
+# up reporting something like ACACAA as the best motif
+if ("element" in debug):
+print >>stderr, "find_repeat_element(%d,%d,%d)" % (period,start,end)
+if ("partial" in debug):
+print period, seq, start, end, allowPartials;
+print seq[start:end]
+kmerToCount = {}
+kmerToFirst = {}
+for ix in xrange(start,end-(period-1)):
+kmer = seq[ix:ix+period]
+if ("N" in kmer): continue
+if (kmer not in kmerToCount):
+kmerToCount[kmer] = 1
+kmerToFirst[kmer] = ix
+else:
+kmerToCount[kmer] += 1
+#if ("element" in debug):
+#    print >>stderr, "    %d: %s" % (ix,kmer)
+# choose the best k-mer;  this is simply the most frequently occurring one,
+# with ties broken by whichever one came first
+kmers = [(-kmerToCount[kmer],kmerToFirst[kmer],kmer) for kmer in kmerToCount]
+if (kmers == []): return (None,None,start,end)
+kmers.sort()
+if ("element" in debug):
+for (count,first,kmer) in kmers:
+print >>stderr, "    %s: %d" % (kmer,-count)
+(count,first,kmer) = kmers[0]
+if (contains_repeat(kmer)): return (None,None,start,end)
+# determine the hamming distance between the run and a simple repeat, for
+# each "plausible" start and end;  we compute the distance for each such
+# interval, and choose the one with the lowest hamming distance;  ties are
+# broken in a deterministic-but-unspecified manner
+bestD = bestStart = bestEnd = None
+###################################################################################
+# modified by Chen Sun(cxs1031@cse.psu.edu) on 10/18/2013
+#     since we do not allow hamming_distance > 0, which means we do not allow mutation,
+# we do not need this section to produce bestStart and End
+###################################################################################
+#for (s,e) in plausible_intervals(start,end,period,len(seq),allowPartials=allowPartials):
+#    d = hamming_distance(seq,s,e,kmer)
+#    if (d == None): continue
+#    if (bestD == None) or (d <= bestD):
+#        (bestD,bestStart,bestEnd) = (d,s,e)
+bestStart = start
+if(allowPartials):
+bestEnd = end
+elif(not allowPartials):
+bestEnd = start
+pattern = seq[start:start+period]
+if ("partial" in debug):
+print "kmer:", kmer
+if(pattern != kmer):
+print "pattern:", pattern
+while(bestEnd <= end-period):
+bestEnd += period
+# bestD will always be 0, as we do not allow mutation
+bestD = 0
+if ("partial" in debug):
+print bestD, bestStart, bestEnd
+###################################################################################
+# modified by Chen Sun(cxs1031@cse.psu.edu) on 10/10
+#
+###################################################################################
+return (kmer,bestD,bestStart,bestEnd)
+def find_hamming_repeat_element(period,seq,start,end,allowPartials=False):
+# count the number of occurences of each k-mer;  note that we can't
+# reject kmers containing smaller repeats yet, since for a sequence like
+# ACACACACACAAACACACACACACACACAC we must first discover ACACAC as the best
+# 6-mer, and THEN reject it;  if we reject ACACAC while counting, we'd end
+# up reporting something like ACACAA as the best motif
+if ("element" in debug):
+print >>stderr, "find_repeat_element(%d,%d,%d)" % (period,start,end)
+kmerToCount = {}
+kmerToFirst = {}
+for ix in xrange(start,end-(period-1)):
+kmer = seq[ix:ix+period]
+if ("N" in kmer): continue
+if (kmer not in kmerToCount):
+kmerToCount[kmer] = 1
+kmerToFirst[kmer] = ix
+else:
+kmerToCount[kmer] += 1
+#if ("element" in debug):
+#    print >>stderr, "    %d: %s" % (ix,kmer)
+# choose the best k-mer;  this is simply the most frequently occurring one,
+# with ties broken by whichever one came first
+kmers = [(-kmerToCount[kmer],kmerToFirst[kmer],kmer) for kmer in kmerToCount]
+if (kmers == []): return (None,None,start,end)
+kmers.sort()
+if ("element" in debug):
+for (count,first,kmer) in kmers:
+print >>stderr, "    %s: %d" % (kmer,-count)
+(count,first,kmer) = kmers[0]
+if (contains_repeat(kmer)): return (None,None,start,end)
+# determine the hamming distance between the run and a simple repeat, for
+# each "plausible" start and end;  we compute the distance for each such
+# interval, and choose the one with the lowest hamming distance;  ties are
+# broken in a deterministic-but-unspecified manner
+bestD = bestStart = bestEnd = None
+for (s,e) in plausible_intervals(start,end,period,len(seq),allowPartials=allowPartials):
+d = hamming_distance(seq,s,e,kmer)
+if (d == None): continue
+if (bestD == None) or (d <= bestD):
+(bestD,bestStart,bestEnd) = (d,s,e)
+return (kmer,bestD,bestStart,bestEnd)
+# plausible_intervals--
+#    Yield all plausible intervals intersecting with a run.  We generate all
+#    starts within P bp of the run's start.  For each of these, we either (a) try
+#    all ends within P bp of run's end, or (b) trim the new interval to a whole
+#    multiple of the period, and report this short interval and the longer
+#    interval with one more period appended.  Case (a) allows partial motifs,
+#    while case (b) only allows whole motifs.
+def plausible_intervals(start,end,period,seqLen,allowPartials=False):
+# generate intervals that allow a partial copy of the motif
+if (allowPartials):
+for candStart in xrange(start-(period-1),start+period):
+if (candStart < 0): continue
+for candEnd in xrange(end-(period-1),end+period):
+if (candEnd > seqLen): continue
+if (candEnd <= candStart+period): continue
+yield (candStart,candEnd)
+# -OR- generate intervals that allow only whole copies of the motif
+else:
+for candStart in xrange(start-(period-1),start+period):
+if (candStart < 0): continue
+candEnd = candStart + ((end-candStart)/period)*period
+yield (candStart,candEnd)
+candEnd += period
+if (candEnd <= seqLen): yield (candStart,candEnd)
+# hamming_distance--
+#    Determine the hamming distance between the run and a simple repeat.
+# $$$ improve this by allowing gaps, and stopping when we reach a threshold
+kmerToDiffs = {}  # (this is used for memo-ization)
+def hamming_distance(seq,start,end,kmer):
+period = len(kmer)
+if (end < start + period): return None
+wholeEnd = start + ((end-start)/period)*period
+if (kmer not in kmerToDiffs):
+kmerToDiffs[kmer] = { kmer:0 }
+d = 0
+for ix in xrange(start,wholeEnd,period):
+qmer = seq[ix:ix+period]    # same size as the kmer motif
+if (qmer in kmerToDiffs[kmer]):
+d += kmerToDiffs[kmer][qmer]
+continue
+diffs = 0
+for iy in xrange(0,period):
+if (qmer[iy] != kmer[iy]): diffs += 1
+kmerToDiffs[kmer][qmer] = diffs
+d += diffs
+if (end > wholeEnd):
+qmer = seq[wholeEnd:end]    # shorter than the kmer motif
+if (qmer in kmerToDiffs[kmer]):
+d += kmerToDiffs[kmer][qmer]
+else:
+diffs = 0
+for iy in xrange(0,len(qmer)):
+if (qmer[iy] != kmer[iy]): diffs += 1
+kmerToDiffs[kmer][qmer] = diffs
+d += diffs
+return d
+# fasta_sequences--
+#    Read the fasta sequences from a file.  Note that we convert to upper case,
+#    and convert any letter other than ACGT to N.
+nonDnaMap = maketrans("BDEFHIJKLMOPQRSUVWXYZ","NNNNNNNNNNNNNNNNNNNNN")
+def fasta_sequences(f):
+seqName = None
+seqNucs = None
+for line in f:
+line = line.strip()
+if (line.startswith(">")):
+if (seqName != None):
+yield (seqName,"".join(seqNucs))
+seqName = sequence_name(line)
+seqNucs = []
+elif (seqName == None):
+assert (False), "first sequence has no header"
+else:
+seqNucs += [line]
+if (seqName != None):
+yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap))
+# fastq_sequences--
+#    Read the fastq sequences from a file.  Note that we convert to upper case,
+#    and convert any letter other than ACGT to N.
+def fastq_sequences(f):
+lineNum = 0
+for line in f:
+lineNum += 1
+line = line.strip()
+if (lineNum % 4 == 1):
+assert (line.startswith("@")), \
+"bad read name at line %d" % lineNum
+seqName = line[1:]
+continue
+if (lineNum % 4 == 2):
+seqNucs = line
+continue
+if (lineNum % 4 == 3):
+assert (line.startswith("+")), \
+"can't understand line %d:\n%s" % (lineNum,line)
+continue
+quals = line
+assert (len(quals) == len(seqNucs)), \
+"length mismatch read vs. qualities at line %d" % lineNum
+yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap),quals)
+assert (lineNum % 4 == 0), \
+"incomplete read at end of file"
+def sam_sequences(f):
+lineNum = 0
+for line in f:
+lineNum += 1
+line = line.strip()
+if line.startswith("@"):
+continue
+columns = line.split("\t")
+seqName = columns[0]
+refName = columns[2]
+pre_s = int(columns[3]) - 1
+cigar = columns[5]
+seqNucs = columns[9]
+yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap), refName, pre_s, cigar)
+# sequence_name--
+#    Extract the sequence name from a fasta header.
+#    $$$ this may need to be improved $$$
+def sequence_name(s):
+s = s[1:].strip()
+if (s == ""): return ""
+else:         return s.split()[0]
+# nucleotide_runs--
+#    Yield (start,end) for all runs of valid nucleotides in a sequence.
+def nucleotide_runs(s):
+runs  = []
+start = None
+for (ix,nuc) in enumerate(s):
+if (nuc in "ACGT"):
+if (start == None):
+start = ix
+else:
+if (start != None):
+yield (start,ix)
+start = None
+if (start != None): yield (start,len(s))
+# contains_repeat--
+#    Determine whether a short sequence contains a repeated element, such as a
+#    6-mer containing a repeated 2-mer (ACACAC) or 3-mer (ACTACT).  The repeat
+#    must cover the entire sequence, without mismatches.
+def contains_repeat(kmer):
+kmerLength = len(kmer)
+hasRepeat = False
+rptLen = 1
+while (not hasRepeat) and (2 * rptLen <= kmerLength):
+if (kmerLength % rptLen != 0):
+rptLen += 1
+continue
+isRepeat = True
+for i in xrange(rptLen,kmerLength,rptLen):
+if (kmer[i:i+rptLen] != kmer[:rptLen]):
+isRepeat = False
+break
+if (isRepeat):
+hasRepeat = True
+break
+rptLen += 1
+return hasRepeat
+# hash108--
+#    Return a 108-bit hash "value" of a string
+def hash108(s):
+m = md5_new()
+m.update(s)
+return m.hexdigest()[:27]
+# float_or_fraction--
+#    Convert a string to a number, allowing fractions
+def float_or_fraction(s):
+if ("/" in s):
+(numer,denom) = s.split("/",1)
+return float(numer)/float(denom)
+else:
+return float(s)
+# int_with_unit--
+#    Parse a string as an integer, allowing unit suffixes
+def int_with_unit(s):
+if (s.endswith("K")):
+multiplier = 1000
+s = s[:-1]
+elif (s.endswith("M")):
+multiplier = 1000 * 1000
+s = s[:-1]
+elif (s.endswith("G")):
+multiplier = 1000 * 1000 * 1000
+s = s[:-1]
+else:
+multiplier = 1
+try:               return               int(s)   * multiplier
+except ValueError: return int(math.ceil(float(s) * multiplier))
+if __name__ == "__main__": main()

Mercurial > repos > arkarachai-fungtammasan > str_fm

comparison microsatellite.py @ 0:07588b899c13 draft