comparison cnvsim/genome_simulator.py @ 18:eca72016b5b3 draft default tip

Uploaded
author ahosny
date Wed, 07 Sep 2016 09:37:49 -0400
parents 63955244b2fa
children
comparison
equal deleted inserted replaced
17:e4ebf3435054 18:eca72016b5b3
51 random.shuffle(cnv_mask) 51 random.shuffle(cnv_mask)
52 52
53 return cnv_mask 53 return cnv_mask
54 54
55 55
56 def _generateCNVList(chromosome_length, number_of_regions, p_amplify, p_delete, min_variation, max_variation): 56 def _generateCNVList(chromosome_length, number_of_regions, p_amplify, p_delete, min_variation, max_variation, region_min_length, region_max_length):
57 ''' 57 '''
58 58 This function generates a CNV list to be used for simulating the CNVs in the next step
59 :param chromosome_length: 59 :param chromosome_length: the length of the simulated chromosome
60 :param number_of_regions: 60 :param number_of_regions: the number of regions to be affected by CNVs
61 :param p_amplify: 61 :param p_amplify: percentage of amplification
62 :param p_delete: 62 :param p_delete: percentage of deletions
63 :param min_variation: 63 :param min_variation: minimum level of variation
64 :param max_variation: 64 :param max_variation: maximum level of variation
65 :return: 65 :param region_min_length: minimum length of a CNV region
66 :param region_max_length: maximum length of a CNV region
67 :return: CNV list
66 ''' 68 '''
67 region_length = chromosome_length / number_of_regions 69 region_length = chromosome_length / number_of_regions
68 start = 0 70 start = 0
69 end = region_length 71 end = region_length
70 cnv_list = [] 72 cnv_list = []
72 for i in range(number_of_regions): 74 for i in range(number_of_regions):
73 # jump forward start 75 # jump forward start
74 jump_start = start + int(random.randrange(int(0.40*region_length), int(0.45*region_length))) 76 jump_start = start + int(random.randrange(int(0.40*region_length), int(0.45*region_length)))
75 77
76 # jump backward end 78 # jump backward end
77 jump_end = end - int(random.randrange(int(0.40*region_length), int(0.45*region_length))) 79 jump_end = jump_start + random.randint(region_min_length, region_max_length) - 1
78 80
79 cnv_list.append([jump_start, jump_end, mask[i]]) 81 cnv_list.append([jump_start, jump_end, mask[i]])
80 start = end 82 start = end
81 end += region_length 83 end += region_length
82 84
103 # amplification 105 # amplification
104 amplification = genome[start-read_length: start] + sequence * variation + genome[end:end+read_length] 106 amplification = genome[start-read_length: start] + sequence * variation + genome[end:end+read_length]
105 cnv_genome.append(amplification) 107 cnv_genome.append(amplification)
106 elif variation < 0: 108 elif variation < 0:
107 # deletion 109 # deletion
108 deletion = genome[start-read_length: start] + sequence * variation + genome[end:end+read_length] 110 deletion = genome[start-read_length: start] + sequence * abs(variation) + genome[end:end+read_length]
109 control_genome.append(deletion) 111 control_genome.append(deletion)
110 112
111 return ''.join(control_genome), ''.join(cnv_genome) 113 return ''.join(control_genome), ''.join(cnv_genome)
112 114
113 115
138 Simulate copy number variations on the passed reference genome based on the given simulation parameters 140 Simulate copy number variations on the passed reference genome based on the given simulation parameters
139 :param simulation_parameters: a dictionary containing parameters for simulation 141 :param simulation_parameters: a dictionary containing parameters for simulation
140 :param cnv_list_parameters: a dictionary containing parameters for CNV List creation 142 :param cnv_list_parameters: a dictionary containing parameters for CNV List creation
141 :return: None 143 :return: None
142 ''' 144 '''
143 _log('simulation type: whole genome') 145 _log('simulation type: whole genome sequencing')
144 146
145 # create a temporary directory for intermediate files 147 # create a temporary directory for intermediate files
146 if not os.path.exists(simulation_parameters['tmp_dir']): 148 if not os.path.exists(simulation_parameters['tmp_dir']):
147 os.makedirs(simulation_parameters['tmp_dir']) 149 os.makedirs(simulation_parameters['tmp_dir'])
148 150
165 header, genome = fileio.readGenome(genome_file) 167 header, genome = fileio.readGenome(genome_file)
166 _log("successfully loaded a genome of length " + `len(genome)`) 168 _log("successfully loaded a genome of length " + `len(genome)`)
167 169
168 if simulation_parameters['cnv_list_file'] is None: 170 if simulation_parameters['cnv_list_file'] is None:
169 # CNV list file 171 # CNV list file
170 cnv_list_file = os.path.join(simulation_parameters['output_dir'], "CNVList.bed") 172 cnv_list_file = os.path.join(simulation_parameters['output_dir'], "copynumber.bed")
171 173
172 _log("generating CNV list ..") 174 _log("generating CNV list ..")
173 cnv_list = _generateCNVList(len(genome), cnv_list_parameters['regions_count'], \ 175 cnv_list = _generateCNVList(len(genome), cnv_list_parameters['regions_count'], \
174 cnv_list_parameters['amplifications'], cnv_list_parameters['deletions'], \ 176 cnv_list_parameters['amplifications'], cnv_list_parameters['deletions'], \
175 cnv_list_parameters['minimum_variations'], \ 177 cnv_list_parameters['minimum_variations'], \
176 cnv_list_parameters['maximum_variations']) 178 cnv_list_parameters['maximum_variations'], \
179 cnv_list_parameters['minimum_length'], \
180 cnv_list_parameters['maximum_length'])
177 cnv_list = map(lambda l: [header.replace('>', '')] + l, cnv_list) 181 cnv_list = map(lambda l: [header.replace('>', '')] + l, cnv_list)
178 182
179 with open(cnv_list_file, 'w') as f: 183 with open(cnv_list_file, 'w') as f:
184 line = 'chrom\tchr_start\tchrom_stop\tnum_positions\tcopy_number\n'
185 f.write(line)
180 for i, cnv_region in enumerate(cnv_list): 186 for i, cnv_region in enumerate(cnv_list):
187 num_positions = cnv_region[2] - cnv_region[1] + 1
181 line = cnv_region[0] + '\t' \ 188 line = cnv_region[0] + '\t' \
182 + `cnv_region[1]` + '\t' \ 189 + `cnv_region[1]` + '\t' \
183 + `cnv_region[2]` + '\t' \ 190 + `cnv_region[2]` + '\t' \
191 + `num_positions` + '\t' \
184 + `cnv_region[3]` + '\n' 192 + `cnv_region[3]` + '\n'
185 f.write(line) 193 f.write(line)
186 _log("randomly generated CNV list saved to " + cnv_list_file) 194 _log("randomly generated CNV list saved to " + cnv_list_file)
187 195
188 else: 196 else:
189 _log("loading CNV list ..") 197 _log("loading CNV list ..")
190 with open(simulation_parameters['cnv_list_file'], "r") as f: 198 with open(simulation_parameters['cnv_list_file'], "r") as f:
191 cnv_list = [] 199 cnv_list = []
192 lines = f.readlines() 200 lines = f.readlines()
201 lines.pop(0)
193 for line in lines: 202 for line in lines:
194 chromosome, region_start, region_end, variation = line.strip().split("\t") 203 chromosome, region_start, region_end, num_positions, variation = line.strip().split("\t")
195 cnv_list.append((chromosome, int(region_start), int(region_end), int(variation))) 204 cnv_list.append((chromosome, int(region_start), int(region_end), int(variation)))
196 205
197 _log("successfully loaded CNV list that contains " + `len(lines)` + " regions ..") 206 _log("successfully loaded CNV list that contains " + `len(lines)` + " regions ..")
198 207
199 # call ART to generate reads from the genome file 208 # call ART to generate reads from the genome file