5
|
1 __author__ = 'Abdelrahman Hosny'
|
|
2
|
|
3 import random
|
|
4 import os
|
|
5 import subprocess
|
|
6 import datetime
|
|
7 import shutil
|
|
8
|
|
9 from . import fileio
|
|
10
|
|
11
|
|
12 def _log(message):
|
|
13 print '[CNV SIM {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + "] " + message
|
|
14
|
|
15
|
|
16 def getScriptPath():
|
|
17 return os.path.dirname(os.path.realpath(__file__))
|
|
18
|
|
19
|
|
20 def _generateCNVMask(mask_length, p_amplify, p_delete, min_variation, max_variation):
|
|
21 '''
|
|
22 This function generates random Copy Number Variations mask list
|
|
23 :param exons: list of regions.
|
|
24 :param p_amplify: percentage of amplifications introduced
|
|
25 :param p_delete: percentage of deletions introduced
|
|
26 :return: a list of the same length as the exons list. each list item
|
|
27 indicates the variation to be added to the exon in the same position.
|
|
28 Positive number: number of amplification
|
|
29 Zero: no change
|
|
30 -1: delete this exon
|
|
31 '''
|
|
32
|
|
33 number_of_amplifications = int(p_amplify * mask_length)
|
|
34 number_of_deletions = int(p_delete * mask_length)
|
|
35 cnv_mask = [0] * mask_length
|
|
36
|
|
37 # generate CNV mask (a list of amplifications and deletions to be applied to the exons)
|
|
38 while number_of_amplifications > 0:
|
|
39 choice = random.randrange(0, mask_length)
|
|
40 while cnv_mask[choice] != 0:
|
|
41 choice = random.randrange(0, mask_length)
|
|
42 cnv_mask[choice] = random.randrange(min_variation, max_variation) # random amplifications in the range [min_variation, max_variation)
|
|
43 number_of_amplifications -= 1
|
|
44 random.shuffle(cnv_mask)
|
|
45 while number_of_deletions > 0:
|
|
46 choice = random.randrange(0, mask_length)
|
|
47 while cnv_mask[choice] != 0:
|
|
48 choice = random.randrange(0, mask_length)
|
|
49 cnv_mask[choice] = -1*random.randrange(min_variation, max_variation) # random deletions in the range [min_variation, max_variation)
|
|
50 number_of_deletions -= 1
|
|
51 random.shuffle(cnv_mask)
|
|
52
|
|
53 return cnv_mask
|
|
54
|
|
55
|
18
|
56 def _generateCNVList(chromosome_length, number_of_regions, p_amplify, p_delete, min_variation, max_variation, region_min_length, region_max_length):
|
5
|
57 '''
|
18
|
58 This function generates a CNV list to be used for simulating the CNVs in the next step
|
|
59 :param chromosome_length: the length of the simulated chromosome
|
|
60 :param number_of_regions: the number of regions to be affected by CNVs
|
|
61 :param p_amplify: percentage of amplification
|
|
62 :param p_delete: percentage of deletions
|
|
63 :param min_variation: minimum level of variation
|
|
64 :param max_variation: maximum level of variation
|
|
65 :param region_min_length: minimum length of a CNV region
|
|
66 :param region_max_length: maximum length of a CNV region
|
|
67 :return: CNV list
|
5
|
68 '''
|
|
69 region_length = chromosome_length / number_of_regions
|
|
70 start = 0
|
|
71 end = region_length
|
|
72 cnv_list = []
|
|
73 mask = _generateCNVMask(number_of_regions, p_amplify, p_delete, min_variation, max_variation)
|
|
74 for i in range(number_of_regions):
|
|
75 # jump forward start
|
|
76 jump_start = start + int(random.randrange(int(0.40*region_length), int(0.45*region_length)))
|
|
77
|
|
78 # jump backward end
|
18
|
79 jump_end = jump_start + random.randint(region_min_length, region_max_length) - 1
|
5
|
80
|
|
81 cnv_list.append([jump_start, jump_end, mask[i]])
|
|
82 start = end
|
|
83 end += region_length
|
|
84
|
|
85 return cnv_list
|
|
86
|
|
87
|
|
88 def _simulateCNV(genome, cnv_list, read_length):
|
|
89 '''
|
|
90 Simulate the control genome and the CNV genome
|
|
91 :param genome: original genome sequence
|
|
92 :param cnv_list: a list of region variations (chromosome, start, end, variation)
|
|
93 :return: control_genome, cnv_genome
|
|
94 '''
|
|
95 control_genome = []
|
|
96 cnv_genome = []
|
|
97
|
|
98 for cnv in cnv_list:
|
|
99 start = cnv[1]
|
|
100 end = cnv[2]
|
|
101 variation = cnv[3]
|
|
102 sequence = genome[start: end]
|
|
103
|
|
104 if variation > 0:
|
|
105 # amplification
|
|
106 amplification = genome[start-read_length: start] + sequence * variation + genome[end:end+read_length]
|
|
107 cnv_genome.append(amplification)
|
|
108 elif variation < 0:
|
|
109 # deletion
|
18
|
110 deletion = genome[start-read_length: start] + sequence * abs(variation) + genome[end:end+read_length]
|
5
|
111 control_genome.append(deletion)
|
|
112
|
|
113 return ''.join(control_genome), ''.join(cnv_genome)
|
|
114
|
|
115
|
|
116 def _callART(genome_file, output_file, read_length, fold_coverage=1):
|
|
117 '''
|
|
118 Calls Wessim to generate artificial reads for the targets on the reference genome
|
|
119 :param genome_file: reference genome file in FASTA format
|
|
120 :param output_file: output file name
|
|
121 :param read_length: the read length
|
|
122 :param fold_coverage: fold coverage for the reads
|
|
123 :return: None
|
|
124 '''
|
|
125 os.chdir(os.path.join(getScriptPath(), "ART"))
|
|
126 subprocess.call(["./art_illumina", \
|
|
127 "-sam", \
|
|
128 "-i", genome_file, \
|
|
129 "-p", \
|
|
130 "-m", "200", \
|
|
131 "-s", "10", \
|
|
132 "-l", str(read_length), \
|
|
133 "-f", str(fold_coverage), \
|
|
134 "-o", output_file], stderr=None)
|
|
135 os.chdir("..")
|
|
136
|
|
137
|
|
138 def simulate_genome_cnv(simulation_parameters, cnv_list_parameters=None):
|
|
139 '''
|
|
140 Simulate copy number variations on the passed reference genome based on the given simulation parameters
|
|
141 :param simulation_parameters: a dictionary containing parameters for simulation
|
|
142 :param cnv_list_parameters: a dictionary containing parameters for CNV List creation
|
|
143 :return: None
|
|
144 '''
|
18
|
145 _log('simulation type: whole genome sequencing')
|
5
|
146
|
|
147 # create a temporary directory for intermediate files
|
|
148 if not os.path.exists(simulation_parameters['tmp_dir']):
|
|
149 os.makedirs(simulation_parameters['tmp_dir'])
|
|
150
|
|
151 # create output directory for final results
|
|
152 if not os.path.exists(simulation_parameters['output_dir']):
|
|
153 os.makedirs(simulation_parameters['output_dir'])
|
|
154
|
|
155 # copy genome to the tmp folder
|
|
156 shutil.copyfile(simulation_parameters['genome_file'], os.path.join(simulation_parameters['tmp_dir'], "reference.fa"))
|
|
157 genome_file = os.path.join(simulation_parameters['tmp_dir'], "reference.fa")
|
|
158
|
|
159 # initialize variables for temporary files
|
|
160 control_genome_file = os.path.join(simulation_parameters['tmp_dir'], "ControlGenome.fa")
|
|
161 cnv_genome_file = os.path.join(simulation_parameters['tmp_dir'], "CNVGenome.fa")
|
|
162 base_reads_file = os.path.join(simulation_parameters['tmp_dir'], "base")
|
|
163 control_reads_file = os.path.join(simulation_parameters['tmp_dir'], "control")
|
|
164 cnv_reads_file = os.path.join(simulation_parameters['tmp_dir'], "cnv")
|
|
165
|
|
166 _log("loading genome file ..")
|
|
167 header, genome = fileio.readGenome(genome_file)
|
|
168 _log("successfully loaded a genome of length " + `len(genome)`)
|
|
169
|
|
170 if simulation_parameters['cnv_list_file'] is None:
|
|
171 # CNV list file
|
18
|
172 cnv_list_file = os.path.join(simulation_parameters['output_dir'], "copynumber.bed")
|
5
|
173
|
|
174 _log("generating CNV list ..")
|
|
175 cnv_list = _generateCNVList(len(genome), cnv_list_parameters['regions_count'], \
|
|
176 cnv_list_parameters['amplifications'], cnv_list_parameters['deletions'], \
|
|
177 cnv_list_parameters['minimum_variations'], \
|
18
|
178 cnv_list_parameters['maximum_variations'], \
|
|
179 cnv_list_parameters['minimum_length'], \
|
|
180 cnv_list_parameters['maximum_length'])
|
5
|
181 cnv_list = map(lambda l: [header.replace('>', '')] + l, cnv_list)
|
|
182
|
|
183 with open(cnv_list_file, 'w') as f:
|
18
|
184 line = 'chrom\tchr_start\tchrom_stop\tnum_positions\tcopy_number\n'
|
|
185 f.write(line)
|
5
|
186 for i, cnv_region in enumerate(cnv_list):
|
18
|
187 num_positions = cnv_region[2] - cnv_region[1] + 1
|
5
|
188 line = cnv_region[0] + '\t' \
|
|
189 + `cnv_region[1]` + '\t' \
|
|
190 + `cnv_region[2]` + '\t' \
|
18
|
191 + `num_positions` + '\t' \
|
5
|
192 + `cnv_region[3]` + '\n'
|
|
193 f.write(line)
|
|
194 _log("randomly generated CNV list saved to " + cnv_list_file)
|
|
195
|
|
196 else:
|
|
197 _log("loading CNV list ..")
|
|
198 with open(simulation_parameters['cnv_list_file'], "r") as f:
|
|
199 cnv_list = []
|
|
200 lines = f.readlines()
|
18
|
201 lines.pop(0)
|
5
|
202 for line in lines:
|
18
|
203 chromosome, region_start, region_end, num_positions, variation = line.strip().split("\t")
|
5
|
204 cnv_list.append((chromosome, int(region_start), int(region_end), int(variation)))
|
|
205
|
|
206 _log("successfully loaded CNV list that contains " + `len(lines)` + " regions ..")
|
|
207
|
|
208 # call ART to generate reads from the genome file
|
|
209 _log("generating reads for the genome ..")
|
|
210 _log("delegating job to ART ...")
|
|
211 _callART(genome_file, base_reads_file, simulation_parameters['read_length'])
|
|
212
|
|
213 _log("simulating copy number variations (amplifications/deletions)")
|
|
214 control_genome, cnv_genome = _simulateCNV(genome, cnv_list, simulation_parameters['read_length'])
|
|
215
|
|
216 _log("saving to the control genome file ..")
|
|
217
|
|
218 with open(control_genome_file, 'w') as fw:
|
|
219 fw.write(header + "\n")
|
|
220 n = 50
|
|
221 l = [control_genome[i:i + n] for i in range(0, len(control_genome), n)]
|
|
222 for line in l:
|
|
223 fw.write(line + "\n")
|
|
224
|
|
225 _log("delegating job to ART ...")
|
|
226 _callART(control_genome_file, control_reads_file, simulation_parameters['read_length'])
|
|
227
|
|
228 _log("saving to the CNV genome file ..")
|
|
229 with open(cnv_genome_file, 'w') as fw:
|
|
230 fw.write(header + "\n")
|
|
231 n = 50
|
|
232 l = [cnv_genome[i:i + n] for i in range(0, len(cnv_genome), n)]
|
|
233 for line in l:
|
|
234 fw.write(line + "\n")
|
|
235
|
|
236 _log("delegating job to ART ...")
|
|
237 _callART(cnv_genome_file, cnv_reads_file, simulation_parameters['read_length'])
|
|
238
|
|
239 _log("merging results ..")
|
|
240 fileio.mergeARTReads(simulation_parameters['tmp_dir'], simulation_parameters['output_dir'])
|
|
241
|
|
242 _log("cleaning temporary files ..")
|
|
243 fileio.clean(simulation_parameters['tmp_dir'])
|
|
244
|
|
245 _log("simulation completed. find results in " + simulation_parameters['output_dir']) |