Mercurial > repos > glogobyte > isoread
comparison mirgene_functions.py @ 5:810e789ffeab draft
Uploaded
| author | glogobyte |
|---|---|
| date | Wed, 13 Oct 2021 16:04:54 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 4:f44185f616bc | 5:810e789ffeab |
|---|---|
| 1 import itertools | |
| 2 import urllib.request | |
| 3 from collections import OrderedDict | |
| 4 import copy | |
| 5 | |
| 6 ######################################################################################################################################################## | |
| 7 | |
| 8 """ Read a file and return it as a list """ | |
| 9 | |
def read(path, flag):
    """Read a text file and return its lines as a list.

    Args:
        path: Path of the file to read.
        flag: ``0`` keeps the line terminators (``readlines``);
            ``1`` strips them (``read().splitlines()``).

    Returns:
        The list of lines, or ``None`` when ``flag`` is neither 0 nor 1
        (historical behaviour, kept so existing callers are unaffected).
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    if flag == 0:
        with open(path) as fp:
            return fp.readlines()

    if flag == 1:
        with open(path) as fp:
            return fp.read().splitlines()

    return None
| 22 | |
| 23 # Write a list to a txt file | |
def write(path, list):
    """Write the inner fields of every record to ``path``.

    For each record ``x`` the slice ``x[1:-1]`` (everything but the first
    and last element) is joined with tabs and written. No separator or
    newline is emitted between records, exactly as before.

    Args:
        path: Destination file; it is truncated if it exists.
        list: Iterable of records whose ``[1:-1]`` slice is an iterable of
            strings. The parameter name shadows the builtin but is kept
            because it is part of the public signature.
    """
    # The ``with`` block closes the file; the old trailing close() was redundant.
    with open(path, 'w') as fp:
        fp.writelines("\t".join(x[1:-1]) for x in list)
| 29 | |
| 30 ######################################################################################################################################################## | |
| 31 | |
| 32 """ Detect the longest common substring sequence between two mirnas """ | |
| 33 | |
def longestSubstring(str1, str2):
    """Return the longest substring shared by ``str1`` and ``str2``.

    Args:
        str1: First string; the returned substring is sliced from it.
        str2: Second string.

    Returns:
        The longest common contiguous substring (ties are resolved by
        ``difflib``: earliest in ``str1``, then earliest in ``str2``), or
        ``None`` after printing a notice when the strings share nothing.
    """
    from difflib import SequenceMatcher

    # autojunk must be off: with the default heuristic, any character that
    # is "popular" in a second string of 200+ characters is ignored, which
    # on a 4-letter nucleotide alphabet means every character is ignored
    # and real matches are missed.
    seqMatch = SequenceMatcher(None, str1, str2, autojunk=False)

    # Result looks like Match(a=0, b=0, size=5)
    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))

    if match.size != 0:
        return str1[match.a: match.a + match.size]

    print('No longest common sub-string found')
    return None
| 50 | |
| 51 | |
| 52 ################################################################################################################################################################# | |
| 53 | |
| 54 """ | |
| 55 | |
| 56 Read the sam files from alignment tool and do the followings: | |
| 57 | |
| 58 1) Keep mapped reads | |
| 59 2) Keep all sequences with length between 18 and 26 nucleotides | |
| 60 3) Detects the ref and templated miRNAs | |
| 61 4) Gives names to templated miRNAs based on ref miRNAs | |
| 62 | |
| 63 """ | |
| 64 | |
def sam_edit(mature_mirnas,path,file,case,l,samples,data,file_order,unmap_seq,names_n_seqs,deseq,mirna_names,ini_sample,unmap_counts):
    """Extract reference and templated miRNAs from one SAM file.

    Steps performed on the file at ``path``:

    1. Drop header lines (first tab field contains "@").
    2. Keep records whose FLAG field is '0' and whose sequence (field 10)
       is 18 to 26 characters long.
    3. For every kept record whose reference name equals an entry of
       ``mature_mirnas``, compare the read with the following entry (the
       reference sequence) and compute the 5' / 3' offsets.
    4. Rename shifted reads to ``<name>_t_<5' offset>_<3' offset>``.

    Args:
        mature_mirnas: Flat list in which a reference name is immediately
            followed by its sequence.
        path: Path of the SAM file.
        file: Sample label appended to ``file_order`` and ``data``.
        case: Group tag; results are only published for "c" or "t".
        l: Lock-like object exposing ``acquire()`` / ``release()``.
        samples, data, file_order, names_n_seqs, deseq, mirna_names,
        ini_sample: Shared list-like collectors that receive the results.
        unmap_seq, unmap_counts: Objects with a numeric ``.value`` that are
            incremented by the number of FLAG '4' records and by their
            read counts respectively.

    Read identifiers are parsed as ``<something>-<count>``; presumably the
    reads were collapsed upstream -- confirm against the pipeline.
    """

    def signed(offset):
        # A zero offset is written bare ("0"); any other carries its sign.
        return "0" if offset == 0 else '{:+d}'.format(offset)

    # read the sam file
    ini_sam = read(path, 0)
    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in main_sam if x[1] == '0' and 18 <= len(x[9]) <= 26]
    filter_sam = [[x[0], x[1], x[2], len(x[9])] for x in main_sam]
    sorted_uni_arms = []

    # Detection of differences between the canonical miRNA and the detected miRNA.
    # The last index is skipped: it has no following sequence entry, so a
    # match there used to raise IndexError on mature_mirnas[i + 1].
    for i in range(len(mature_mirnas) - 1):
        tmp_count_reads = 0  # total number of reads for this reference
        tmp_count_seq = 0    # total number of distinct sequences
        rejected = False

        for j, row in enumerate(unique_seq):
            if mature_mirnas[i] != row[2]:
                continue

            temp_mature = mature_mirnas[i + 1]
            off_part = longestSubstring(temp_mature, row[9])

            # Lengths of what precedes / follows the shared core in each string
            mat_diff = temp_mature.split(off_part)
            mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

            unique_diff = row[9].split(off_part)
            unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

            # Handling of some special mirnas like (hsa-miR-8485): both
            # strings have extra bases on the same side of the core, so the
            # read is not a plain shift of the reference. Mark it for removal.
            if (mat_diff[1] != 0 and unique_diff[1] != 0) or (mat_diff[0] != 0 and unique_diff[0] != 0):
                unique_seq[j] = 1
                rejected = True
                continue

            # Keep the findings
            pre_pos = mat_diff[0] - unique_diff[0]
            post_pos = unique_diff[1] - mat_diff[1]
            tmp_count_reads += int(row[0].split("-")[1])
            tmp_count_seq += 1

            # Store the detected miRNAs with new names according to the findings
            if pre_pos != 0 or post_pos != 0:
                row[2] = row[2] + "_t_" + signed(pre_pos) + "_" + signed(post_pos)

        # Remove the "1" placeholders left by the special-case handling
        if rejected:
            unique_seq = [row for row in unique_seq if row != 1]

        # metrics for the production of database
        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i], tmp_count_seq, tmp_count_reads])

    # Sorting of the metrics for database
    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)

    # Correction of metrics due to the collapsing and removing of duplicates
    for y in sorted_uni_arms:
        counts = 0
        seqs = 0
        for x in unique_seq:
            parts = x[2].split("_")
            if y[0] == parts[0] + "_" + parts[1]:
                counts += int(x[0].split("-")[1])
                seqs += 1
        y[1] = seqs
        y[2] = counts

    # Output variables. try/finally guarantees the lock is released even if
    # publishing raises; otherwise sibling workers would block forever.
    # NOTE(review): the source this was recovered from had lost its
    # indentation; all shared-state updates are placed under the ``case``
    # check here -- confirm against the upstream file.
    l.acquire()
    try:
        if case == "c" or case == "t":
            names_n_seqs.extend([[y[2], y[9]] for y in unique_seq])
            deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq])
            mirna_names.extend([z[2] for z in unique_seq])
            unmap_seq.value += sum([1 for x in main_sam if x[1] == '4'])
            unmap_counts.value += sum([int(x[0].split("-")[1]) for x in main_sam if x[1] == '4'])
            file_order.append(file)
            samples.append(unique_seq)
            data.append([case, file, unique_seq, sorted_uni_arms])
            ini_sample.append(filter_sam)
    finally:
        l.release()
| 157 | |
| 158 | |
| 159 ###################################################################################################################################### | |
| 160 """ | |
| 161 | |
| 162 Read a sam file from Bowtie and do the followings: | |
| 163 | |
| 164 1) Keep unmapped reads | |
| 165 2) Keep all sequences with length between 18 and 26 nucleotides | |
| 166 3) Detects the non-template isomirs | |
| 167 4) Gives names to isomir's based on ref miRNAs | |
| 168 | |
| 169 """ | |
| 170 | |
def non_sam_edit(mature_mirnas,path,file,case,l,data,file_order,n_deseq,names_n_seqs):
    """Detect non-templated isomiRs among the unmapped reads of a SAM file.

    Steps performed on the file at ``path``:

    1. Drop header lines (first tab field contains "@").
    2. Keep records whose FLAG field is '4' and whose sequence (field 10)
       is 18 to 26 characters long.
    3. For every reference sequence (odd positions of ``mature_mirnas``),
       find reads that start with it and extend it by fewer than four
       extra bases at the end.
    4. Name each hit ``<ref name>_nont_<5' offset>_<3' offset>_<added tail>``.

    Args:
        mature_mirnas: Flat list; odd indices hold sequences and the
            preceding even index holds the matching name.
        path: Path of the SAM file.
        file: Sample label appended to ``file_order`` and ``data``.
        case: Group tag; results are only published for "c" or "t".
        l: Lock-like object exposing ``acquire()`` / ``release()``.
        data, file_order, n_deseq, names_n_seqs: Shared list-like
            collectors that receive the results.
    """

    # read the sam file
    ini_sam = read(path, 0)
    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in main_sam if x[1] == '4' and 18 <= len(x[9]) <= 26]
    uni_seq = []

    # Calculate the shifted positions for every isomir and add them to its name
    sorted_uni_arms = []
    for i in range(1, len(mature_mirnas), 2):
        tmp_count_reads = 0  # total number of reads for this reference
        tmp_count_seq = 0    # total number of distinct sequences

        # Loop-invariant: normalise the reference once per reference
        temp_mature = mature_mirnas[i].strip().replace("U", "T")
        if not temp_mature:
            # A blank reference is contained in every read and has no
            # common substring to split on; it used to crash here.
            continue

        for row in unique_seq:
            # Only reads that contain the whole canonical sequence qualify
            if temp_mature not in row[9]:
                continue

            off_part = longestSubstring(temp_mature, row[9])

            mat_diff = temp_mature.split(off_part)
            mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

            unique_diff = row[9].split(off_part)
            # More than two pieces means the core occurs several times in
            # the read; such reads are skipped.
            if len(unique_diff) > 2:
                continue
            unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

            pre_pos = mat_diff[0] - unique_diff[0]
            post_pos = unique_diff[1] - mat_diff[1]

            # NOTE(review): indentation was lost in the recovered source;
            # naming is assumed to apply only to reads that pass this check
            # -- confirm against the upstream file.
            if pre_pos == 0 and post_pos < 4:
                tmp_count_reads += int(row[0].split("-")[1])
                tmp_count_seq += 1

                # Rename a copy so the original record stays untouched
                t_name = list(row)
                t_name[2] = mature_mirnas[i - 1] + "_nont_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(row[9][len(off_part):])
                uni_seq.append(t_name)

        # metrics for the production of database
        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i - 1], tmp_count_seq, tmp_count_reads])

    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
    # Order-preserving removal of identical records
    unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple, uni_seq))))

    # Output variables. try/finally guarantees the lock is released even if
    # publishing raises.
    # NOTE(review): the shared-state updates are placed under the ``case``
    # check; the recovered source had no indentation -- confirm upstream.
    l.acquire()
    try:
        if case == "c" or case == "t":
            names_n_seqs.extend([[y[2], y[9]] for y in unique_seq if y[2] != "*"])
            n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2] != "*"])
            file_order.append(file)
            data.append([case, file, unique_seq, sorted_uni_arms])
    finally:
        l.release()
| 231 | |
| 232 ################################################################################################################################################################################################################# | |
| 233 | |
| 234 """ | |
| 235 | |
| 236 This function detects the differences between the two groups (control, treated). | |
| 237 Then copy the undetected miRNAs from one group to other and add zeros as counts. | |
| 238 With this way the two groups will have the same number of miRNAs. | |
| 239 | |
| 240 """ | |
| 241 | |
def black_white(mirna_names_1,mirna_names_2,group,manager):
    """Pad ``group`` with zero-count rows for miRNAs it is missing.

    Entries of ``mirna_names_1`` that do not occur in ``mirna_names_2``
    are de-duplicated, given a "0" for every count column of ``group`` and
    appended after the sorted, de-duplicated rows of ``group``.

    Args:
        mirna_names_1: Name rows (lists of strings) of the other group.
        mirna_names_2: Name rows already present in this group.
        group: Count rows ``[name, sequence, count, ...]``; sorted in
            place, as before.
        manager: List-like collector extended with the resulting rows.

    The rows of ``mirna_names_1`` are no longer modified: the previous
    implementation appended the zeros to the caller's own lists.
    """
    # Hash-based membership instead of a list scan per candidate
    known = {tuple(x) for x in mirna_names_2}
    add_names = sorted(x for x in mirna_names_1 if tuple(x) not in known)
    add_names = [key for key, _ in itertools.groupby(add_names)]

    group.sort()
    merged = [key for key, _ in itertools.groupby(group)]

    # Every column after name and sequence is a per-sample count. With an
    # empty group there are no count columns to pad (this used to raise
    # IndexError).
    n_counts = len(merged[0]) - 2 if merged else 0
    zeros = ["0"] * n_counts

    merged.extend([list(name) + zeros for name in add_names])
    manager.extend(merged)
| 256 | |
| 257 ########################################################################################################> | |
| 258 | |
| 259 """ | |
| 260 | |
| 261 This function collapses the miRNAs with same sequences and different names into one entry | |
| 262 by merging all the different names into one and are separated with the character "/" | |
| 263 | |
| 264 """ | |
| 265 | |
def merging_dupes(group,f_dupes):
    """Find sequences carried by several names and merge those names.

    A row ``[name, sequence, ...]`` flags its sequence as duplicated when
    either its sequence or its name has been seen in an earlier row. For
    each duplicated sequence the names of all rows carrying it are
    gathered; two names that share the same base (the first two
    "_"-separated parts) collapse to the shorter one. The gathered names
    are joined with "/": the first name, then the others in reverse
    collection order.

    Args:
        group: Rows whose first two items are name and sequence.
        f_dupes: List-like collector extended with ``[sequence,
            merged_name]`` pairs, in order of first detection (previously
            the order came from a ``set`` and varied between runs).
    """

    def base_name(name):
        # First two "_"-separated parts; a name without "_" is its own base
        # (this used to raise IndexError).
        return "_".join(name.split("_")[:2])

    # Pass 1: which sequences are duplicated?
    seen = set()
    by_seq = {}  # sequence -> [sequence, name, name, ...]
    for row in group:
        if row[1] not in seen and row[0] not in seen:
            seen.add(row[1])
            seen.add(row[0])
        else:
            by_seq.setdefault(row[1], [row[1]])

    # Pass 2: gather the names of every duplicated sequence
    for row in group:
        entry = by_seq.get(row[1])
        if entry is None:
            continue
        name = row[0]
        if len(entry) == 1:
            entry.append(name)
            continue
        for i in range(1, len(entry)):
            if base_name(entry[i]) == base_name(name):
                # Same miRNA arm: keep only the shorter spelling
                if len(name) < len(entry[i]):
                    del entry[i]
                    entry.append(name)
                break
        else:
            entry.append(name)

    # Pass 3: fold the names into a single "/"-separated label
    dupes = []
    for entry in by_seq.values():
        if len(entry) > 2:
            entry[1:] = ["/".join([entry[1]] + entry[:1:-1])]
        dupes.append(entry)

    f_dupes.extend(dupes)
| 309 | |
| 310 | |
| 311 ########################################################################################################> | |
| 312 | |
| 313 """ | |
| 314 | |
| 315 This function removes the duplications of sequences based on output from the function merging_dupes | |
| 316 | |
| 317 """ | |
| 318 | |
def apply_merging_dupes(group,dupes,managger):
    """Rename rows that carry a duplicated sequence and drop repeated rows.

    Args:
        group: Rows ``[name, sequence, ...]``. Matching rows are renamed
            in place and the list is sorted in place, as before.
        dupes: ``[sequence, merged_name]`` pairs; when a sequence is
            listed more than once the last pair wins.
        managger: List-like collector extended with the sorted rows,
            consecutive identical rows removed.
    """
    # One dictionary lookup per row instead of scanning ``dupes`` each time.
    # The whole pair is stored so a malformed pair only fails when matched.
    pair_by_seq = {pair[0]: pair for pair in dupes}

    for row in group:
        pair = pair_by_seq.get(row[1])
        if pair is not None:
            row[0] = pair[1]

    group.sort()
    managger.extend([key for key, _ in itertools.groupby(group)])
| 329 | |
| 330 ########################################################################################################> | |
| 331 | |
| 332 """ | |
| 333 | |
| 334 This function is optional and performs a filter for low counts miRNAs based on | |
| 335 number of counts and the percentage of the samples, according to user preferences | |
| 336 | |
| 337 """ | |
| 338 | |
def filter_low_counts(c_group,t_group,fil_c_group,fil_t_group,per,counts):
    """Keep miRNAs that are sufficiently expressed in either group.

    Row ``i`` of both matrices is kept when, in the control group or in
    the treated group, at least ``per`` percent of the samples (rounded
    with ``round``) have a count of ``counts`` or more.

    Args:
        c_group: Control rows ``[name, sequence, count, ...]``.
        t_group: Treated rows, aligned index-by-index with ``c_group``.
        fil_c_group: List-like collector extended with kept control rows.
        fil_t_group: List-like collector extended with kept treated rows.
        per: Percentage of samples, anything ``int()`` accepts.
        counts: Minimum count, anything ``int()`` accepts.
    """
    if len(c_group) == 0:
        # Nothing to filter (this used to raise IndexError)
        return

    percent = int(per) / 100
    min_count = int(counts)

    # The number of sample columns is taken from the first row; the old
    # code read row 1 and therefore crashed on a single-row matrix.
    c_col_filter = round(percent * (len(c_group[0]) - 2))
    t_col_filter = round(percent * (len(t_group[0]) - 2))

    c_group_new = []
    t_group_new = []
    for i, c_row in enumerate(c_group):
        t_row = t_group[i]
        c_cols = sum(1 for value in c_row[2:] if int(value) >= min_count)
        t_cols = sum(1 for value in t_row[2:] if int(value) >= min_count)

        if c_cols >= c_col_filter or t_cols >= t_col_filter:
            t_group_new.append(t_row)
            c_group_new.append(c_row)

    fil_c_group.extend(c_group_new)
    fil_t_group.extend(t_group_new)
| 361 | |
| 362 ################################################################################################################################################################################################################## | |
| 363 | |
| 364 """ | |
| 365 | |
| 366 This function exports the count matrices for every group (controls, treated) | |
| 367 and condition (ref and templated miRNAs, non-templated miRNAs) | |
| 368 | |
| 369 """ | |
| 370 | |
def _write_count_matrix(path, sample_names, rows):
    """Write one tab-separated count matrix: a header line, then one line per row."""
    with open(path, 'w') as fp:
        fp.write("Name\tSequence")
        for sample in sample_names:
            fp.write("\t" + sample)
        for row in rows:
            fp.write("\n%s" % "\t".join(row))


def write_main(raw_con, raw_tre, fil_con, fil_tre, con_file_order, tre_file_order, flag, n1, n2, per):
    """Export the count matrices of both groups into the ``Counts/`` folder.

    Args:
        raw_con, raw_tre: Unfiltered rows of the control / treated group.
        fil_con, fil_tre: Filtered rows of the control / treated group.
        con_file_order, tre_file_order: Sample names used as column headers.
        flag: ``1`` writes "Templated" files, ``2`` writes "Non-Templated"
            files; any other value writes nothing.
        n1, n2: Labels of the control / treated group used in file names.
        per: Filter percentage; the "Filtered" files are skipped when
            ``int(per) == -1``.

    The ``Counts`` directory must already exist. Files are now closed even
    if writing fails part-way.
    """
    if flag == 1:
        suffix = ' Templated Counts'
    elif flag == 2:
        suffix = ' Non-Templated Counts'
    else:
        return

    if int(per) != -1:
        _write_count_matrix('Counts/Filtered ' + n2 + suffix, tre_file_order, fil_tre)
        _write_count_matrix('Counts/Filtered ' + n1 + suffix, con_file_order, fil_con)

    _write_count_matrix('Counts/Raw ' + n2 + suffix, tre_file_order, raw_tre)
    _write_count_matrix('Counts/Raw ' + n1 + suffix, con_file_order, raw_con)
| 460 | |
| 461 #################################################################################################################################################################################################################### | |
| 462 | |
| 463 """ | |
| 464 | |
| 465 This function exports the files of the database with all the info | |
| 466 about every type of the detected miRNAs for every sample | |
| 467 | |
| 468 """ | |
| 469 | |
def DB_write(con,name,unique_seq,sorted_uni_arms,f):
    """Write the per-sample database report of detected miRNAs.

    For every entry of ``sorted_uni_arms`` a banner with its name,
    sequence count and read total is written, followed by the matching
    records of ``unique_seq`` sorted by read count (highest first).

    Args:
        con: Group tag, "c" or "t"; selects the output folder.
        name: File name inside the output folder.
        unique_seq: SAM-like records; field 0 is ``<id>-<count>``, field 2
            the miRNA name and field 9 the sequence.
        sorted_uni_arms: ``[name, sequence_count, read_total]`` triples.
        f: ``1`` writes templated results to ``split1/`` (c) or ``split2/``
            (t); ``2`` writes non-templated results to ``split3/`` (c) or
            ``split4/`` (t); any other value writes nothing.

    Raises:
        ValueError: if ``f`` is 1 or 2 and ``con`` is neither "c" nor "t"
            (this used to fail later with an unbound-variable error).
    """
    if f == 1:
        folders = {"c": 'split1/', "t": 'split2/'}
    elif f == 2:
        folders = {"c": 'split3/', "t": 'split4/'}
    else:
        return

    if con not in folders:
        raise ValueError("con must be 'c' or 't', got %r" % (con,))

    separator = "*********************************************************************************************************\n"

    # ``with`` closes the report even when a malformed record raises
    with open(folders[con] + name, 'w') as fp:
        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))

        for arm in sorted_uni_arms:
            if f == 1:
                # NOTE(review): this is a substring test on the first two
                # "_"-separated name parts, not an equality test; kept as
                # found -- confirm it is intentional.
                temp = [x for x in unique_seq
                        if arm[0] in (x[2].split("_")[0] + "_" + x[2].split("_")[1])]
            else:
                temp = [x for x in unique_seq if arm[0] == x[2].split("_nont_")[0]]
                # Non-templated reports omit references without any hit
                if not temp:
                    continue

            temp.sort(key=lambda x: int(x[0].split('-')[1]), reverse=True)
            fp.write(separator)
            fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(arm[0]),"Sequence count = "+str(arm[1]),"Total reads = "+str(arm[2]),"|"))
            fp.write(separator + "\n")
            for x in temp:
                fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2], x[9]))
            fp.write("\n" + "\n")
| 519 | |
| 520 | |
| 521 ######################################################################################################################### | |
| 522 | |
| 523 """ | |
| 524 | |
| 525 This function merges the different names for the same mirna sequence per group (controls, treated) to avoid duplicates | |
| 526 | |
| 527 """ | |
| 528 | |
def merging_names(ini_mat,new):
    """Merge the different names of one sequence and drop repeated rows.

    This is the two-step procedure implemented by ``merging_dupes``
    (detect duplicated sequences and build their "/"-joined name) and
    ``apply_merging_dupes`` (rename the rows, sort, remove consecutive
    identical rows) run back to back on a single matrix. The body used to
    be a line-for-line copy of those two functions; it now calls them so
    the logic lives in one place.

    Args:
        ini_mat: Rows ``[name, sequence, ...]``; renamed and sorted in
            place, as before.
        new: List-like collector extended with the de-duplicated rows.
    """
    dupes = []
    merging_dupes(ini_mat, dupes)
    apply_merging_dupes(ini_mat, dupes, new)
| 580 | |
| 581 #################################################################################################################################################################################################################### | |
| 582 | |
| 583 """ | |
| 584 | |
| 585 This function exports the count matrices for differential expression | |
| 586 if user chose analysis with non-templated miRNAs detection | |
| 587 | |
| 588 """ | |
| 589 | |
def nontemp_counts_to_diff(tem_names,tem_samp,non_names,non_samp,folder,pro):
    """Write one two-column count file per sample, templated plus non-templated.

    For every count column of ``tem_samp`` a file ``<folder><sample>.txt``
    is written with a ``miRNA id<TAB><sample>`` header, the templated
    counts of that sample, and then the non-templated counts taken from
    every ``non_samp`` column whose name in ``non_names`` equals the
    sample name.

    Args:
        tem_names: Sample names, aligned with the count columns of ``tem_samp``.
        tem_samp: Templated rows ``[name, sequence, count, ...]``.
        non_names: Sample names, aligned with the count columns of ``non_samp``.
        non_samp: Non-templated rows ``[name, sequence, count, ...]``.
        folder: Path prefix, concatenated directly with the sample name.
        pro: Unused; kept for signature compatibility.
    """
    if len(tem_samp) == 0:
        # No rows means no sample columns to export (this used to raise IndexError)
        return

    for i in range(2, len(tem_samp[0])):
        sample = tem_names[i - 2]
        # Columns of the non-templated matrix that belong to this sample
        non_cols = [j + 2 for j, other in enumerate(non_names) if other == sample]

        # ``with`` closes the file even if a malformed row raises
        with open(folder + sample + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + sample + "\n")

            for x in tem_samp:
                fp.write("\t".join([x[0], x[i]]) + "\n")

            for col in non_cols:
                for x in non_samp:
                    fp.write("\t".join([x[0], x[col]]) + "\n")
| 605 | |
| 606 ################################################################################################################################################################################################################# | |
| 607 | |
| 608 """ | |
| 609 | |
| 610 This function exports the count matrices for differential expression | |
| 611 if user chose analysis only with templated miRNAs detection | |
| 612 | |
| 613 """ | |
| 614 | |
def temp_counts_to_diff(names,samp,folder,pro):
    """Write one two-column count file per sample for templated miRNAs.

    For every count column of ``samp`` a file ``<folder><sample>.txt`` is
    written with a ``miRNA id<TAB><sample>`` header followed by one
    ``name<TAB>count`` line per row.

    Args:
        names: Sample names, aligned with the count columns of ``samp``.
        samp: Rows ``[name, sequence, count, ...]``.
        folder: Path prefix, concatenated directly with the sample name.
        pro: Unused; kept for signature compatibility.
    """
    if len(samp) == 0:
        # No rows means no sample columns to export (this used to raise IndexError)
        return

    for i in range(2, len(samp[0])):
        sample = names[i - 2]
        # ``with`` closes the file even if a malformed row raises
        with open(folder + sample + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + sample + "\n")
            for x in samp:
                fp.write("\t".join([x[0], x[i]]) + "\n")
| 625 | |
| 626 ################################################################################################################################################################################################################# | |
| 627 | |
| 628 """ | |
| 629 | |
| 630 This function downloads the fasta files from MirGene site | |
| 631 with ref miRNAs and star miRNAs sequences and merges them | |
| 632 into one list | |
| 633 | |
| 634 """ | |
| 635 | |
def _fetch_fasta_lines(url):
    """Download ``url`` and return its non-blank lines with every ">" removed."""
    # The response is closed by ``with``; the timeout keeps a stalled
    # connection from hanging the whole run.
    with urllib.request.urlopen(url, timeout=60) as response:
        text = response.read().decode('utf-8')
    # splitlines() copes with a missing trailing newline and with CRLF.
    # Blank lines are dropped so names and sequences keep alternating.
    return [line.replace(">", "") for line in text.splitlines() if line.strip()]


def download_matures(matures,org_name):
    """Download mature and star miRNA FASTA records and merge them.

    Both FASTA files of ``org_name`` are fetched from mirgenedb.org (the
    ``?mat=1`` and ``?star=1`` variants). The result is a flat list in
    which each name (">" removed) is followed by its sequence, with "U"
    replaced by "T" in the sequences; mature records come first.

    Args:
        matures: List-like collector extended with the merged list.
        org_name: Organism identifier inserted into the download URL.

    The previous version always deleted the last line of each download,
    which discarded a real sequence when the payload did not end with a
    newline and left stray blank lines otherwise.
    """
    base_url = 'http://mirgenedb.org/fasta/' + org_name

    mature_mir = _fetch_fasta_lines(base_url + '?mat=1')
    mature_mir.extend(_fetch_fasta_lines(base_url + '?star=1'))

    # Sequences sit at the odd positions
    for i in range(1, len(mature_mir), 2):
        mature_mir[i] = mature_mir[i].replace("U", "T")

    matures.extend(mature_mir)
| 661 | |
| 662 ################################################################################################################### | |
| 663 | |
| 664 """ | |
| 665 | |
| 666 This function detects the templated isoforms from the 1st part of analysis | |
| 667 These isoforms and ref miRNAs will be used for the detection of non-templated miRNAs | |
| 668 | |
| 669 """ | |
| 670 | |
def non_template_ref(sc,st,all_isoforms):
    """Collect the templated isoforms found in both groups.

    Every record whose name (field 2) contains "_t_" and is not yet in
    ``all_isoforms`` contributes two consecutive entries: its name and
    its sequence (field 9). Control samples are processed before treated
    samples.

    Args:
        sc: Control samples, each a list of SAM-like records.
        st: Treated samples, same layout.
        all_isoforms: List-like collector extended in place.
    """
    # Snapshot both inputs first, as the original did
    groups = (list(sc), list(st))

    # Mirror of ``all_isoforms`` for constant-time membership tests. It
    # holds names and sequences, like the list the old code searched.
    seen = set(all_isoforms)

    for samples in groups:
        for sample in samples:
            for record in sample:
                name = record[2]
                if "_t_" in name and name not in seen:
                    all_isoforms.append(name)
                    all_isoforms.append(record[9])
                    seen.add(name)
                    seen.add(record[9])
| 687 | |
| 688 ################################################################################################################################################################################################ | |
| 689 | |
| 690 """ | |
| 691 | |
| 692 This function adds uncommon detected mirnas among the samples with zeros as counts | |
| 693 | |
| 694 """ | |
| 695 | |
def uncommon_mirnas(sample,mir_names,l,new_d,sample_name,sample_order):
    """Add the miRNAs missing from ``sample`` with a count of zero.

    Args:
        sample: Rows ``[name, count, sequence]`` of one sample; missing
            entries are appended and the list is sorted by name in place,
            as before.
        mir_names: ``[name, sequence]`` pairs of all miRNAs of interest.
        l: Lock-like object exposing ``acquire()`` / ``release()``.
        new_d: List-like collector that receives the completed sample
            (consecutive identical rows removed).
        sample_name: Label of the sample.
        sample_order: List-like collector that receives ``sample_name``.
    """
    # Names already present; kept up to date while rows are appended so a
    # name repeated in ``mir_names`` is added only once.
    present = {row[0] for row in sample}

    for entry in mir_names:
        if entry[0] not in present:
            # name, zero count, sequence
            sample.append([entry[0], "0", entry[1]])
            present.add(entry[0])

    # sorting and remove duplicates
    sample.sort(key=lambda x: x[0])
    completed = [row for row, _ in itertools.groupby(sample)]

    # Publish the updated sample; try/finally guarantees the lock is
    # released even if one of the appends raises.
    l.acquire()
    try:
        new_d.append(completed)
        sample_order.append(sample_name)
    finally:
        l.release()
| 716 | |
| 717 ############################################################################################################################################################################################### |
