Mercurial > repos > jason-ellul > calculatezprimefactor
diff test-data/generate_reads.py @ 5:d7efb5d5352a draft default tip
Uploaded
author | jason-ellul |
---|---|
date | Wed, 01 Jun 2016 02:40:16 -0400 |
parents | 796653c6376b |
children |
line wrap: on
line diff
--- a/test-data/generate_reads.py Wed Jun 01 02:36:11 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,118 +0,0 @@ -#!/usr/bin/env python - - -import random -import math - - -class Region: - def __init__(self,start,stop,sequence): - self.start = start - self.stop = stop - self.sequence = sequence.strip().replace("\n","").replace(" ","") - if(len(self.sequence) != self.getSpanningLength()): - print "ERROR: sequence length: "+str(len(self.sequence))+", while spanning region is: "+str(self.getSpanningLength()) - import sys - sys.exit() - - def getSpanningLength(self): - return abs(self.stop-self.start+1) - -class ReadSynthesizer: - def __init__(self,chromosome): - self.regions = [] - self.chromosome = chromosome - - def addRegion(self,region): - self.regions.append(region) - - def produceReads(self,readDensity = 1,read_length = 50): - """ - Produces uniform reads by walking iteratively over self.regions - """ - - mRNA = self.getTotalmRNA() - spanning_length = self.getRegionSpanningLength() - n = spanning_length['total'] - read_length + 1 - - j = 0 - k = 0 - - for i in range(n): - # "alpha is playing the role of k and beta is playing the role of theta" - dd = max(0,int(round(random.lognormvariate(math.log(readDensity),0.5))))# Notice this is NOT a binomial distribution!! - - for d in range(dd): - sequence = mRNA[i:i+read_length] - - if(random.randint(0,1) == 0): - strand = 0 - else: - strand = 16 - flag = strand + 0 - - print "read_"+str(j)+"."+str(i)+"."+str(d)+"\t"+str(flag)+"\t"+self.chromosome+"\t"+str(self.regions[j].start + k)+"\t60\t"+self.getMappingString(read_length,j,k)+"\t*\t0\t0\t"+str(sequence.upper())+"\t*" - - spanning_length['iter'][j] -= 1 - if(k >= self.regions[j].getSpanningLength()-1): - j += 1 - k = 0 - else: - k += 1 - - def getMappingString(self,length,j,offset): - m = 0 - - out = "" - - for i in range(length): - k = i + offset - - if(k >= self.regions[j].getSpanningLength()): - j += 1 - - out += str(m)+"M" - out += (str(self.regions[j].start - self.regions[j-1].stop-1))+"N" - m = 1 - - offset = -k - else: - m += 1 - - out += str(m) + "M" - - - return out - - def getRegionSpanningLength(self): - length = {'total':0,'iter':[]} - for r in self.regions: - l = r.getSpanningLength() - length['iter'].append(l) - length['total'] += l - return length - - def getTotalmRNA(self): - mRNA = "" - for r in self.regions: - mRNA += r.sequence - return mRNA - -#rs = ReadSynthesizer('chr6') -#rs.addRegion(Region(100,149,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaag')) -#rs.addRegion(Region(151,152,'at')) -#rs.produceReads(3,50) - - -rs = ReadSynthesizer('chr6') -rs.addRegion(Region(154360546,154360969,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaaggaagcggctgaggcgcttggaacccgaaaagtctcggtgctcctggctacctcgcacagcggtgcccgcccggccgtcagtaccatggacagcagcgctgcccccacgaacgccagcaattgcactgatgccttggcgtactcaagttgctccccagcacccagccccggttcctgggtcaacttgtcccacttagatggcGacctgtccgacccatgcggtccgaaccgcaccgacctgggcgggagagacagcctgtgccctccgaccggcagtccctccatgatcacggccatcacgatcatggccctctactccatcgtgtgcgtggtggggctcttcggaaacttcctggtcatgtatgtgattgtcag')) -rs.addRegion(Region(154410961,154411313,'atacaccaagatgaagactgccaccaacatctacattttcaaccttgctctggcagatgccttagccaccagtaccctgcccttccagagtgtgaattacctaatgggaacatggccatttggaaccatcctttgcaagatagtgatctccatagattactataacatgttcaccagcatattcaccctctgcaccatgagtgttgatcgatacattgcagtctgccaccctgtcaaggccttagatttccgtactccccgaaatgccaaaattatcaatgtctgcaactggatcctctcttcagccattggtcttcctgtaatgttcatggctacaacaaaatacaggcaag')) -rs.addRegion(Region(154412087,154412607,'gttccatagattgtacactaacattctctcatccaacctggtactgggaaaacctgctgaagatctgtgttttcatcttcgccttcattatgccagtgctcatcattaccgtgtgctatggactgatgatcttgcgcctcaagagtgtccgcatgctctctggctccaaagaaaaggacaggaatcttcgaaggatcaccaggatggtgctggtggtggtggctgtgttcatcgtctgctggactcccattcacatttacgtcatcattaaagccttggttacaatcccagaaactacgttccagactgtttcttggcacttctgcattgctctaggttacacaaacagctgcctcaacccagtcctttatgcatttctggatgaaaacttcaaacgatgcttcagagagttctgtatcccaacctcttccaacattgagcaacaaaactccactcgaattcgtcagaacactagagaccacccctccacggccaatacagtggatagaactaatcatcag')) -rs.addRegion(Region(154428600,154428787,'gtggaattgaacctggactgtcactgtgaaaatgcaaagccttggccactgagctacaatgcagggcagtctccatttcccttcccaggaagagtctagagcattaattttgagtttgcaaaggcttgtaactatttcatatgatttttagagctgactatgacatgaaccctaaaattcctgttccc')) -rs.produceReads(3,50) - - - - - -