Mercurial > repos > ahosny > cnvsim
comparison cnvsim/genome_simulator.py @ 18:eca72016b5b3 draft default tip
Uploaded
author | ahosny |
---|---|
date | Wed, 07 Sep 2016 09:37:49 -0400 |
parents | 63955244b2fa |
children |
comparison
equal
deleted
inserted
replaced
17:e4ebf3435054 | 18:eca72016b5b3 |
---|---|
51 random.shuffle(cnv_mask) | 51 random.shuffle(cnv_mask) |
52 | 52 |
53 return cnv_mask | 53 return cnv_mask |
54 | 54 |
55 | 55 |
56 def _generateCNVList(chromosome_length, number_of_regions, p_amplify, p_delete, min_variation, max_variation): | 56 def _generateCNVList(chromosome_length, number_of_regions, p_amplify, p_delete, min_variation, max_variation, region_min_length, region_max_length): |
57 ''' | 57 ''' |
58 | 58 This function generates a CNV list to be used for simulating the CNVs in the next step |
59 :param chromosome_length: | 59 :param chromosome_length: the length of the simulated chromosome |
60 :param number_of_regions: | 60 :param number_of_regions: the number of regions to be affected by CNVs |
61 :param p_amplify: | 61 :param p_amplify: percentage of amplification |
62 :param p_delete: | 62 :param p_delete: percentage of deletions |
63 :param min_variation: | 63 :param min_variation: minimum level of variation |
64 :param max_variation: | 64 :param max_variation: maximum level of variation |
65 :return: | 65 :param region_min_length: minimum length of a CNV region |
66 :param region_max_length: maximum length of a CNV region | |
67 :return: CNV list | |
66 ''' | 68 ''' |
67 region_length = chromosome_length / number_of_regions | 69 region_length = chromosome_length / number_of_regions |
68 start = 0 | 70 start = 0 |
69 end = region_length | 71 end = region_length |
70 cnv_list = [] | 72 cnv_list = [] |
72 for i in range(number_of_regions): | 74 for i in range(number_of_regions): |
73 # jump forward start | 75 # jump forward start |
74 jump_start = start + int(random.randrange(int(0.40*region_length), int(0.45*region_length))) | 76 jump_start = start + int(random.randrange(int(0.40*region_length), int(0.45*region_length))) |
75 | 77 |
76 # jump backward end | 78 # jump backward end |
77 jump_end = end - int(random.randrange(int(0.40*region_length), int(0.45*region_length))) | 79 jump_end = jump_start + random.randint(region_min_length, region_max_length) - 1 |
78 | 80 |
79 cnv_list.append([jump_start, jump_end, mask[i]]) | 81 cnv_list.append([jump_start, jump_end, mask[i]]) |
80 start = end | 82 start = end |
81 end += region_length | 83 end += region_length |
82 | 84 |
103 # amplification | 105 # amplification |
104 amplification = genome[start-read_length: start] + sequence * variation + genome[end:end+read_length] | 106 amplification = genome[start-read_length: start] + sequence * variation + genome[end:end+read_length] |
105 cnv_genome.append(amplification) | 107 cnv_genome.append(amplification) |
106 elif variation < 0: | 108 elif variation < 0: |
107 # deletion | 109 # deletion |
108 deletion = genome[start-read_length: start] + sequence * variation + genome[end:end+read_length] | 110 deletion = genome[start-read_length: start] + sequence * abs(variation) + genome[end:end+read_length] |
109 control_genome.append(deletion) | 111 control_genome.append(deletion) |
110 | 112 |
111 return ''.join(control_genome), ''.join(cnv_genome) | 113 return ''.join(control_genome), ''.join(cnv_genome) |
112 | 114 |
113 | 115 |
138 Simulate copy number variations on the passed reference genome based on the given simulation parameters | 140 Simulate copy number variations on the passed reference genome based on the given simulation parameters |
139 :param simulation_parameters: a dictionary containing parameters for simulation | 141 :param simulation_parameters: a dictionary containing parameters for simulation |
140 :param cnv_list_parameters: a dictionary containing parameters for CNV List creation | 142 :param cnv_list_parameters: a dictionary containing parameters for CNV List creation |
141 :return: None | 143 :return: None |
142 ''' | 144 ''' |
143 _log('simulation type: whole genome') | 145 _log('simulation type: whole genome sequencing') |
144 | 146 |
145 # create a temporary directory for intermediate files | 147 # create a temporary directory for intermediate files |
146 if not os.path.exists(simulation_parameters['tmp_dir']): | 148 if not os.path.exists(simulation_parameters['tmp_dir']): |
147 os.makedirs(simulation_parameters['tmp_dir']) | 149 os.makedirs(simulation_parameters['tmp_dir']) |
148 | 150 |
165 header, genome = fileio.readGenome(genome_file) | 167 header, genome = fileio.readGenome(genome_file) |
166 _log("successfully loaded a genome of length " + `len(genome)`) | 168 _log("successfully loaded a genome of length " + `len(genome)`) |
167 | 169 |
168 if simulation_parameters['cnv_list_file'] is None: | 170 if simulation_parameters['cnv_list_file'] is None: |
169 # CNV list file | 171 # CNV list file |
170 cnv_list_file = os.path.join(simulation_parameters['output_dir'], "CNVList.bed") | 172 cnv_list_file = os.path.join(simulation_parameters['output_dir'], "copynumber.bed") |
171 | 173 |
172 _log("generating CNV list ..") | 174 _log("generating CNV list ..") |
173 cnv_list = _generateCNVList(len(genome), cnv_list_parameters['regions_count'], \ | 175 cnv_list = _generateCNVList(len(genome), cnv_list_parameters['regions_count'], \ |
174 cnv_list_parameters['amplifications'], cnv_list_parameters['deletions'], \ | 176 cnv_list_parameters['amplifications'], cnv_list_parameters['deletions'], \ |
175 cnv_list_parameters['minimum_variations'], \ | 177 cnv_list_parameters['minimum_variations'], \ |
176 cnv_list_parameters['maximum_variations']) | 178 cnv_list_parameters['maximum_variations'], \ |
179 cnv_list_parameters['minimum_length'], \ | |
180 cnv_list_parameters['maximum_length']) | |
177 cnv_list = map(lambda l: [header.replace('>', '')] + l, cnv_list) | 181 cnv_list = map(lambda l: [header.replace('>', '')] + l, cnv_list) |
178 | 182 |
179 with open(cnv_list_file, 'w') as f: | 183 with open(cnv_list_file, 'w') as f: |
184 line = 'chrom\tchr_start\tchrom_stop\tnum_positions\tcopy_number\n' | |
185 f.write(line) | |
180 for i, cnv_region in enumerate(cnv_list): | 186 for i, cnv_region in enumerate(cnv_list): |
187 num_positions = cnv_region[2] - cnv_region[1] + 1 | |
181 line = cnv_region[0] + '\t' \ | 188 line = cnv_region[0] + '\t' \ |
182 + `cnv_region[1]` + '\t' \ | 189 + `cnv_region[1]` + '\t' \ |
183 + `cnv_region[2]` + '\t' \ | 190 + `cnv_region[2]` + '\t' \ |
191 + `num_positions` + '\t' \ | |
184 + `cnv_region[3]` + '\n' | 192 + `cnv_region[3]` + '\n' |
185 f.write(line) | 193 f.write(line) |
186 _log("randomly generated CNV list saved to " + cnv_list_file) | 194 _log("randomly generated CNV list saved to " + cnv_list_file) |
187 | 195 |
188 else: | 196 else: |
189 _log("loading CNV list ..") | 197 _log("loading CNV list ..") |
190 with open(simulation_parameters['cnv_list_file'], "r") as f: | 198 with open(simulation_parameters['cnv_list_file'], "r") as f: |
191 cnv_list = [] | 199 cnv_list = [] |
192 lines = f.readlines() | 200 lines = f.readlines() |
201 lines.pop(0) | |
193 for line in lines: | 202 for line in lines: |
194 chromosome, region_start, region_end, variation = line.strip().split("\t") | 203 chromosome, region_start, region_end, num_positions, variation = line.strip().split("\t") |
195 cnv_list.append((chromosome, int(region_start), int(region_end), int(variation))) | 204 cnv_list.append((chromosome, int(region_start), int(region_end), int(variation))) |
196 | 205 |
197 _log("successfully loaded CNV list that contains " + `len(lines)` + " regions ..") | 206 _log("successfully loaded CNV list that contains " + `len(lines)` + " regions ..") |
198 | 207 |
199 # call ART to generate reads from the genome file | 208 # call ART to generate reads from the genome file |