Mercurial > repos > gdroc > scaffhunter
view scaffhunter/reorderient.py @ 2:ec31e4883c99 draft default tip
Uploaded
author | gdroc |
---|---|
date | Mon, 14 Nov 2016 08:04:53 -0500 |
parents | 9c61692acd7b |
children |
line wrap: on
line source
# # Copyright 2014 CIRAD # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, see <http://www.gnu.org/licenses/> or # write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, # MA 02110-1301, USA. # # import optparse, os, shutil, subprocess, sys, tempfile, fileinput, ConfigParser, operator, time, random def factorielle(x): if x < 2: return 1 else: return x * factorielle(x-1) def calcul_score_id(MAT, ORDRE, MARK, ORIENT, DIC_INDEX, NB): #on cree l'ordre des markers scaff_ordre = list(ORDRE) ordre = [] for n in scaff_ordre: liste = list(MARK[n]) if ORIENT[n] == 'REV': liste.reverse() ordre = ordre + liste #on enregistre les positions relative des markers dans cet ordre liste_ordre = {} i = 0 for n in ordre: liste_ordre[n] = i i += 1 #on calcule le score score = 0 while len(scaff_ordre) > 1: scaff_fait = scaff_ordre[0] del scaff_ordre[0] # print scaff_fait, MARK[scaff_fait] for n in MARK[scaff_fait]: mark = DIC_INDEX[n] for j in scaff_ordre: for k in MARK[j]: val = MAT[mark][k] if val != 999999999: score = score + ((1-(abs(liste_ordre[n]-liste_ordre[k])/NB))*val) return score def calcul_score_dif(MAT, ORDRE, MARK, ORIENT, DIC_INDEX, NB): #on cree l'ordre des markers scaff_ordre = list(ORDRE) ordre = [] for n in scaff_ordre: liste = list(MARK[n]) if ORIENT[n] == 'REV': liste.reverse() ordre = ordre + liste #on enregistre les positions relative des markers dans cet ordre liste_ordre = {} i = 0 for n in ordre: liste_ordre[n] = i i += 1 #on calcule le score score = 0 while len(scaff_ordre) > 1: scaff_fait = scaff_ordre[0] del scaff_ordre[0] # print scaff_fait, MARK[scaff_fait] for n in MARK[scaff_fait]: mark = DIC_INDEX[n] for j in scaff_ordre: for k in MARK[j]: val = MAT[mark][k] if val != 999999999: score = score + ((abs(liste_ordre[n]-liste_ordre[k])/NB)*val) return score def rear_fait(ORD, ORIENT): mot = ''.join(ORD) for n in ORD: mot = mot+ORIENT[n] return mot def reorderient(ORDRE, ORIENT): NEW_ORDRE = list(ORDRE) NEW_ORIENT = {} for n in ORIENT: NEW_ORIENT[n] = ORIENT[n] if random.randint(0,1) == 0: #On travail sur 1 scaffold pos = random.randint(0, (len(ORDRE)-1)) scaffold = NEW_ORDRE[pos] if random.randint(0,1) == 0: #On fait un deplacement de scaffold pos_new = pos while pos_new == pos: pos_new = random.randint(0, (len(ORDRE)-1)) del NEW_ORDRE[pos] if scaffold in NEW_ORDRE: sys.exit('bug') NEW_ORDRE.insert(pos_new, scaffold) if random.randint(0,1) == 0: #On fait en plus une reorientation if NEW_ORIENT[scaffold] == 'FWD': NEW_ORIENT[scaffold] = 'REV' elif NEW_ORIENT[scaffold] == 'REV': NEW_ORIENT[scaffold] = 'FWD' else: sys.exit('bug') else: #On fait juste une reorientation if NEW_ORIENT[scaffold] == 'FWD': NEW_ORIENT[scaffold] = 'REV' elif NEW_ORIENT[scaffold] == 'REV': NEW_ORIENT[scaffold] = 'FWD' else: sys.exit('bug') else: pos_debut = random.randint(0, (len(ORDRE)-2)) pos_fin = pos_debut while (pos_debut + 1) >= pos_fin and len(NEW_ORDRE[pos_debut:pos_fin]) == len(NEW_ORDRE): pos_fin = random.randint(1, len(ORDRE)) scaffold = NEW_ORDRE[pos_debut:pos_fin] del NEW_ORDRE[pos_debut:pos_fin] if random.randint(0,1) == 0: #On fait un deplacement de scaffold pos_new = pos_debut while pos_new == pos_debut: pos_new = random.randint(0, len(NEW_ORDRE)) if random.randint(0,1) == 0: #On fait en plus une reorientation for n in scaffold: NEW_ORDRE.insert(pos_new, n) if NEW_ORIENT[n] == 'FWD': NEW_ORIENT[n] = 'REV' elif NEW_ORIENT[n] == 'REV': NEW_ORIENT[n] = 'FWD' else: sys.exit('bug') else: #On fait juste un deplacement for n in scaffold: NEW_ORDRE.insert(pos_new, n) pos_new += 1 else: #On fait juste une reorientation for n in scaffold: NEW_ORDRE.insert(pos_debut, n) if NEW_ORIENT[n] == 'FWD': NEW_ORIENT[n] = 'REV' elif NEW_ORIENT[n] == 'REV': NEW_ORIENT[n] = 'FWD' else: sys.exit('bug') if len(NEW_ORDRE) != len(ORDRE): sys.exit('bug') return [NEW_ORDRE, NEW_ORIENT] def __main__(): #Parse Command Line parser = optparse.OptionParser(usage="python %prog [options]\n\nProgram designed by Guillaume MARTIN : guillaume.martin@cirad.fr\n\n" "This program take in input a matrix containing maker linkage or divergence and the two output files of UPGMA.py and try to optimize the " "scaffold order and orientation proposed by UPGMA.py.\n" "The optimization is performed by calculating a score for the scaffold order, trying rearrangements (scaffold or scaffold group permutations " "and/or inversion) followed by score re-calculation. If the new score is better than the previous, the new order is conserved; else, the previous " "order is conserved. The program stops when more than a defined number of consecutive rearrangement (passed in --iter argument) do not improve the ordering.\n" "This program output a file containing ordered scaffolds in a table format and a table file containing ordered markers relative to the scaffold order.\n" ) # Wrapper options. parser.add_option( '', '--mat', dest='mat', default=None, help='Matrix file with pairwise statistics between markers.') parser.add_option( '', '--scaff', dest='scaff', default=None, help='The output file passed in --out1 argument of UPGMA.py. This file contains ordered and orientated scaffolds ate the end of UPGMA.py program.') parser.add_option( '', '--mark', dest='mark', default=None, help='A table file containing in column 1: markers name, column2: scaffold name. This is the file generated by UPGMA.py in --out2 argument') parser.add_option( '', '--iter', dest='iter', default='auto', help='Number of consecutive rearrangement tried without improvement to stop the improvement of the ordering and orientation (integer or auto). [default: %default]') parser.add_option( '', '--type', dest='type', default='IDENT', help='Data type : IDENT (marker linkage) or DIF (recombination rate). [default: %default]') parser.add_option( '', '--out1', dest='out1', default='UPGMA_ordered_opt_scaf.txt', help='Output file name of ordered scaffold. [default: %default]') parser.add_option( '', '--out2', dest='out2', default='UPGMA_ordered_opt_mark.txt', help='Output file name of ordered markers. [default: %default]') (options, args) = parser.parse_args() if options.scaff == None: sys.exit('--scaff argument is missing') if options.mark == None: sys.exit('--mark argument is missing') if options.mat == None: sys.exit('--mat argument is missing') #on enregistre dans une liste les markers dans l'ordre dans la matrice os.system('echo "Registering scaffold order"') file = open(options.mat) index_mat = file.readline().split()[1:] file.close() # On enregistre l'orientation des scaffolds os.system('echo "Registering scaffold orientation"') dico_orient = {} file = open(options.scaff) for line in file: data = line.split() if data: dico_orient[data[0]] = data[1] file.close() #on enregistre dans un dico les scaffolds et l'ordre des positions (dans la matrice) des marqueurs dans le scaffold. Dans une liste on enregistre l'ordre des scaffolds os.system('echo "Registering scaffold order and markers order"') file = open(options.mark) dico_mark = {}#pour chaque cle = scaffold, on enregistre l'ordre des markers dans ce scaffold liste_ordre = []#Ordre des scaffolds dico_index = {}#position du markers dans la matrice # dico_orient = {}####### nb_mark = 0 for line in file: data = line.split() if data: nb_mark += 1 if data[1] in dico_mark: dico_index[index_mat.index(data[0])] = data[0] if dico_orient[data[1]] == 'FWD': dico_mark[data[1]].append(index_mat.index(data[0])) elif dico_orient[data[1]] == 'REV': dico_mark[data[1]].insert(0, index_mat.index(data[0])) else: sys.exit('bugggggg') else: dico_mark[data[1]] = [] dico_mark[data[1]].append(index_mat.index(data[0])) dico_index[index_mat.index(data[0])] = data[0] liste_ordre.append(data[1]) # dico_orient[data[1]] = 'FWD'################ file.close() nb_mark = float(nb_mark) #On verifie le nombre d'iteration total_possible = ((float(len(liste_ordre)-1)**2)*2)+(len(liste_ordre)) if options.iter == 'auto': iteration = total_possible os.system('echo "The number of iteration is automatically estimated to the number of possible iterations in case of one scaffold repositioned : '+str(total_possible)+'"') elif options.iter == 'all': os.system('echo "All possible rearrangments are tested"') else: iteration = int(options.iter) if iteration > total_possible: os.system('echo "The number of iteration is over the number of possible iterations in case of one scaffold repositioned. It has been ajusted to : '+str(total_possible)+' if not the script can be in a infinite loop"') iteration = total_possible else: os.system('echo "The number of iteration is under the number of possible iterations in case of one scaffold repositioned ('+str(total_possible)+'). You choose : '+str(iteration)+'"') #on charge la matrice dans un dico os.system('echo "Recording pairwise information"') matrix = {} info_scaff = {} file = open(options.mat) file.readline() for line in file: data = line.split() if data: if data[0] in matrix: sys.exit("Warning there is redundancy in markers names, verify your data") else: matrix[data[0]] = map(float, data[1:]) file.close() if options.type == 'IDENT': #On essaye toutes les orientations possibles de cet ordre os.system('echo "Looking for best orientation in this order"') t0 = time.clock() score = calcul_score_id(matrix, liste_ordre, dico_mark, dico_orient, dico_index, nb_mark) os.system('echo "Reference score : '+str(score)+'; time spent in calculation : '+str(time.clock()-t0)+'"') for n in liste_ordre: t0 = time.clock() dico_reor = {} for j in dico_orient: if j == n: if dico_orient[j] == 'FWD': dico_reor[j] = 'REV' elif dico_orient[j] == 'REV': dico_reor[j] = 'FWD' else: sys.exit('bug') # print n, dico_reor[j], dico_orient[n] else: dico_reor[j] = str(dico_orient[j]) score_reor = calcul_score_id(matrix, liste_ordre, dico_mark, dico_reor, dico_index, nb_mark) os.system('echo "Reorient score : '+str(score_reor)+'; time spent in calculation : '+str(time.clock()-t0)+'"') if score < score_reor: dico_orient = {} for k in dico_reor: dico_orient[k] = str(dico_reor[k]) score = score_reor #On fait des rearrangements os.system('echo "Performing rearrangements"') iter = 0 fait = set() fait.add(rear_fait(liste_ordre, dico_orient)) while iter < iteration: t0 = time.clock() iter += 1 REOR = reorderient(liste_ordre,dico_orient) j = 0 while rear_fait(REOR[0], REOR[1]) in fait: j += 1 REOR = reorderient(liste_ordre,dico_orient) fait.add(rear_fait(REOR[0], REOR[1])) score_reor = calcul_score_id(matrix, REOR[0], dico_mark, REOR[1], dico_index, nb_mark) os.system('echo "iteration : '+str(iter)+'; Reoriented score : '+str(score_reor)+', best score : '+str(score)+'; time spent in calculation : '+str(time.clock()-t0)+'"') if score < score_reor: dico_orient = {} for n in REOR[1]: dico_orient[n] = REOR[1][n] score = score_reor liste_ordre = list(REOR[0]) iter = 0 fait = set() fait.add(rear_fait(liste_ordre, dico_orient)) #On fait une derniere etape pour voir ce qui est oriente et ce qui ne l'est pas os.system('echo "Looking for orientated scaffolds and those who are not"') os.system('echo "Reference score : '+str(score)+'; time spent in calculation : '+str(time.clock()-t0)+'"') dico_ordre_final = {} for n in liste_ordre: t0 = time.clock() dico_reor = {} nom_ord = n for j in dico_orient: if j == n: if dico_orient[j] == 'FWD': dico_reor[j] = 'REV' else: dico_reor[j] = 'FWD' else: dico_reor[j] = dico_orient[j] score_reor = calcul_score_id(matrix, liste_ordre, dico_mark, dico_reor, dico_index, nb_mark) os.system('echo "Reorient score : '+str(score_reor)+'; time spent in calculation : '+str(time.clock()-t0)+'"') if score < score_reor: dico_orient = {} for k in dico_reor: dico_orient[k] = dico_reor[k] score = score_reor dico_ordre_final[n] = 'Ord' elif score == score_reor: dico_ordre_final[n] = 'NoOrd' else: dico_ordre_final[n] = 'Ord' elif options.type == 'DIF': #On essaye toutes les orientations possibles de cet ordre os.system('echo "Looking for best orientation in this order"') t0 = time.clock() score = calcul_score_dif(matrix, liste_ordre, dico_mark, dico_orient, dico_index, nb_mark) os.system('echo "Reference score : '+str(score)+'; time spent in calculation : '+str(time.clock()-t0)+'"') for n in liste_ordre: t0 = time.clock() dico_reor = {} for j in dico_orient: if j == n: if dico_orient[j] == 'FWD': dico_reor[j] = 'REV' elif dico_orient[j] == 'REV': dico_reor[j] = 'FWD' else: sys.exit('bug') else: dico_reor[j] = dico_orient[j] score_reor = calcul_score_dif(matrix, liste_ordre, dico_mark, dico_reor, dico_index, nb_mark) os.system('echo "Reorient score : '+str(score_reor)+'; time spent in calculation : '+str(time.clock()-t0)+'"') if score < score_reor: dico_orient = {} for k in dico_reor: dico_orient[k] = dico_reor[k] score = score_reor #On fait des rearrangements os.system('echo "Performing rearrangements"') iter = 0 fait = set() fait.add(rear_fait(liste_ordre, dico_orient)) while iter < iteration: t0 = time.clock() iter += 1 REOR = reorderient(liste_ordre,dico_orient) while rear_fait(REOR[0], REOR[1]) in fait: REOR = reorderient(liste_ordre,dico_orient) fait.add(rear_fait(REOR[0], REOR[1])) score_reor = calcul_score_dif(matrix, REOR[0], dico_mark, REOR[1], dico_index, nb_mark) os.system('echo "iteration : '+str(iter)+'; Reoriented score : '+str(score_reor)+', best score : '+str(score)+'; time spent in calculation : '+str(time.clock()-t0)+'"') if score < score_reor: dico_orient = {} for n in REOR[1]: dico_orient[n] = REOR[1][n] score = score_reor liste_ordre = list(REOR[0]) iter = 0 fait = set() fait.add(rear_fait(liste_ordre, dico_orient)) #On fait une derniere etape pour voir ce qui est oriente et ce qui ne l'est pas os.system('echo "Looking for orientated scaffolds and those who are not"') os.system('echo "Reference score : '+str(score)+'; time spent in calculation : '+str(time.clock()-t0)+'"') dico_ordre_final = {} for n in liste_ordre: t0 = time.clock() dico_reor = {} nom_ord = n for j in dico_orient: if j == n: if dico_orient[j] == 'FWD': dico_reor[j] = 'REV' else: dico_reor[j] = 'FWD' else: dico_reor[j] = dico_orient[j] score_reor = calcul_score_dif(matrix, liste_ordre, dico_mark, dico_reor, dico_index, nb_mark) os.system('echo "Reorient score : '+str(score_reor)+'; time spent in calculation : '+str(time.clock()-t0)+'"') if score < score_reor: dico_orient = {} for k in dico_reor: dico_orient[k] = dico_reor[k] score = score_reor dico_ordre_final[n] = 'Ord' elif score == score_reor: dico_ordre_final[n] = 'NoOrd' if dico_orient[n] == 'REV': # dico_orient = {} # for k in dico_reor: # dico_orient[k] = dico_reor[k] # else: dico_ordre_final[n] = 'Ord' else: sys.exit('Wrong argument passed in --type') #On genere le fichier pour matrix to orthodother os.system('echo "Outputing ordonned markers"') outfile1 = open(options.out1,'w') outfile = open(options.out2,'w') for n in liste_ordre: outfile1.write(n+'\t'+dico_orient[n]+'\t'+dico_ordre_final[n]+'\n') liste = list(dico_mark[n]) if dico_orient[n] == 'REV': liste.reverse() for j in liste: outfile.write(dico_index[j]+'\t'+n+'\n') outfile.close() outfile1.close() if __name__ == "__main__": __main__()