Mercurial > repos > izsam > phylogeny_converter
diff converter.py @ 0:37392af48c37 draft default tip
Uploaded
author | izsam |
---|---|
date | Thu, 19 Mar 2015 11:46:50 -0400 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python
"""Convert sequence files between FASTA, GenBank, Phylip and Nexus.

Usage::

    converter.py INPUT OUTPUT CONVERSION MODE

CONVERSION is a ``source-target`` pair built from the letters ``f`` (FASTA),
``g`` (GenBank), ``p`` (Phylip) and ``n`` (Nexus), e.g. ``f-p``.  MODE is
``single`` (one sequence; only ``f-g`` and ``g-f``) or ``multi``.

The code deliberately sticks to constructs that run on both Python 2.7 and
Python 3.
"""

import os
import re
import string
import sys

# Layout constants of the formats that are written.
_NAME_WIDTH = 10      # Phylip taxon names are exactly ten characters wide
_FASTA_WIDTH = 80     # residues per FASTA line
_NEXUS_BLOCK = 100    # residues per interleaved Nexus block
_NEXUS_GROUP = 20     # residues per space-separated group in a Nexus row
_GENBANK_LINE = 60    # residues per GenBank ORIGIN line
_GENBANK_GROUP = 10   # residues per space-separated group in an ORIGIN line

_NEXUS_HEADER = (
    "#NEXUS\n\nBEGIN DATA;\nDIMENSIONS NTAX=%s NCHAR=%s;\n"
    "FORMAT DATATYPE=DNA INTERLEAVE MISSING=-;\n\nMATRIX\n"
)

# User-facing messages (passed to sys.exit, i.e. printed on stderr).
_ALIGNMENT_ERROR = (
    "The input file does not contains a multiple alignment in fasta format. "
    "Please ensure that all the sequences have the same length"
)
_PHYLIP_ERROR = (
    "The input file is not in the proper format. Please check that your "
    "file is in Phylip standard interleaved (or sequential) format "
)
_NEXUS_ERROR = "The input file is not in Nexus format. "
_FASTA_ERROR = (
    "The input file is not in fasta format. Please check that the first row "
    "starts with > and that the sequence starts from the second line"
)
_GENBANK_ERROR = (
    "The input file is not in GenBank format. Please make sure that the "
    "file contains at least the LOCUS and ORIGIN fields. The file must "
    "also ends with //"
)

# Conversion code -> name of the convertitori method implementing it.
_SINGLE_CONVERSIONS = {"f-g": "fg", "g-f": "gf"}
_MULTI_CONVERSIONS = {
    "f-g": "fg", "g-f": "gf",
    "f-p": "fp", "f-n": "fn",
    "p-f": "pf", "p-n": "pn",
    "n-p": "np", "n-f": "nf",
}


def _chunks(text, size):
    """Return *text* cut into consecutive pieces of at most *size* chars."""
    return [text[i:i + size] for i in range(0, len(text), size)]


def _grouped(text, size):
    """Return *text* with a single space inserted every *size* characters."""
    return " ".join(_chunks(text, size))


def _fixed_width_name(name):
    """Truncate *name*, or pad it with underscores, to exactly ten chars."""
    return name[:_NAME_WIDTH].ljust(_NAME_WIDTH, "_")


def _read_fasta(lines):
    """Parse FASTA *lines* into a list of ``(name, sequence)`` tuples.

    A record starts at a line beginning with ``>``.  Whitespace inside the
    header is replaced by underscores; sequence lines are stripped and
    concatenated.  Lines before the first header are ignored.
    """
    records = []
    name = None
    pieces = []
    for line in lines:
        if line.startswith(">"):
            if name is not None:
                records.append((name, "".join(pieces)))
            name = re.sub(r"\s", "_", line[1:].rstrip("\r\n"))
            pieces = []
        elif name is not None:
            pieces.append(line.strip())
    if name is not None:
        records.append((name, "".join(pieces)))
    return records


def _write_fasta_record(output, name, sequence):
    """Write one FASTA record, wrapping the sequence at 80 columns."""
    output.write(">" + name + "\n")
    for piece in _chunks(sequence, _FASTA_WIDTH):
        output.write(piece + "\n")


def _write_nexus(output, records):
    """Write *records* (``(name, sequence)`` pairs) as an interleaved
    Nexus DATA block.

    Rows are ``name<TAB>sequence`` with the sequence in groups of 20;
    blocks hold 100 residues and are separated by three blank lines.  The
    matrix is closed with ``;`` and ``END;`` so that the result is a
    complete Nexus block.
    """
    nchar = max([len(seq) for _, seq in records]) if records else 0
    output.write(_NEXUS_HEADER % (len(records), nchar))
    blocks = []
    for start in range(0, nchar, _NEXUS_BLOCK):
        rows = [
            "%s\t%s\n" % (
                name,
                _grouped(seq[start:start + _NEXUS_BLOCK], _NEXUS_GROUP),
            )
            for name, seq in records
        ]
        blocks.append("".join(rows))
    output.write("\n\n\n".join(blocks))
    output.write(";\nEND;\n")


def _split_phylip_row(line):
    """Split the first line of a Phylip taxon into ``(name, sequence)``.

    If the line contains a tab (the layout written by ``fp``), the first
    tab separates name and data; otherwise the name is the first ten
    columns.  Padding spaces around the name and all whitespace inside the
    sequence are removed.
    """
    line = line.rstrip("\r\n")
    if "\t" in line:
        name, _, data = line.partition("\t")
    else:
        name, data = line[:_NAME_WIDTH], line[_NAME_WIDTH:]
    return name.strip(), "".join(data.split())


def _read_phylip(lines):
    """Parse Phylip *lines* into a list of ``(name, sequence)`` tuples.

    Three layouts are recognised, in this order:

    * sequential - exactly one line per taxon after the header;
    * paired - the first data line is at most ten characters long: names
      and sequences alternate on separate lines, blank lines are ignored;
    * interleaved - a first block of ``ntax`` named rows followed by
      continuation blocks, each introduced by a blank line.

    Exits with the Phylip format message when the file matches none.
    """
    try:
        ntax = int(lines[0].split()[0])
    except (IndexError, ValueError):
        sys.exit(_PHYLIP_ERROR)
    body = lines[1:]
    if ntax < 1 or len(body) < ntax:
        sys.exit(_PHYLIP_ERROR)

    if len(body) == ntax:
        return [_split_phylip_row(line) for line in body]

    if len(body[0].rstrip("\r\n")) <= _NAME_WIDTH:
        content = [line for line in body if line.strip()]
        if len(content) % 2:
            # A name without its sequence line.
            sys.exit(_PHYLIP_ERROR)
        return [
            (
                content[i].rstrip("\r\n")[:_NAME_WIDTH].strip(),
                "".join(content[i + 1].split()),
            )
            for i in range(0, len(content), 2)
        ]

    rows = [list(_split_phylip_row(line)) for line in body[:ntax]]
    # Position inside the current block.  Starting at ntax means that a
    # continuation line is only accepted after a blank separator line.
    row = ntax
    for line in body[ntax:]:
        if not line.strip():
            row = 0
            continue
        if row >= ntax:
            sys.exit(_PHYLIP_ERROR)
        rows[row][1] += "".join(line.split())
        row += 1
    return [(name, seq) for name, seq in rows]


def _read_nexus_matrix(lines):
    """Parse the MATRIX section of Nexus *lines* into ``(name, sequence)``
    tuples, in order of first appearance.

    Every row contributes its data to its taxon, so interleaved and
    non-interleaved matrices are both supported.  Reading stops at the
    terminating ``;`` (alone, or glued to the last row) or at ``END;``.
    Exits with the Nexus format message when no row is found.
    """
    order = []
    sequences = {}
    in_matrix = False
    for line in lines:
        tokens = line.split()
        if not in_matrix:
            in_matrix = "MATRIX" in line or (
                bool(tokens) and tokens[0].lower() == "matrix"
            )
            continue
        if not tokens:
            continue
        first = tokens[0]
        if first.startswith(";") or first.lower() in ("end;", "endblock;"):
            break
        last_row = tokens[-1].endswith(";")
        if last_row:
            tokens[-1] = tokens[-1][:-1]
        if first not in sequences:
            order.append(first)
            sequences[first] = ""
        sequences[first] += "".join(tokens[1:])
        if last_row:
            break
    if not order:
        sys.exit(_NEXUS_ERROR)
    return [(name, sequences[name]) for name in order]


class convertitori(object):
    """Format converters; each method reads ``self.input`` (a list of
    lines) and writes the converted text to ``self.output``.

    Method names are ``<source><target>`` using the letters f (FASTA),
    g (GenBank), p (Phylip) and n (Nexus).
    """

    def __init__(self, input, inouttype, type, output):
        self.input = input          # list of input lines (with newlines)
        self.inouttype = inouttype  # conversion code, e.g. "f-p"
        self.type = type            # "single" or "multi"
        self.output = output        # writable text file object

    def _aligned_fasta(self):
        """Return the FASTA records with ten-character names, exiting if
        there are none or the sequences differ in length."""
        records = _read_fasta(self.input)
        if not records:
            sys.exit(_ALIGNMENT_ERROR)
        width = len(records[0][1])
        for _, seq in records:
            if len(seq) != width:
                sys.exit(_ALIGNMENT_ERROR)
        return [(_fixed_width_name(name), seq) for name, seq in records]

    def fp(self):
        """FASTA alignment -> sequential Phylip (``name<TAB>sequence``)."""
        records = self._aligned_fasta()
        self.output.write("%d\t%d\n" % (len(records), len(records[0][1])))
        for name, seq in records:
            self.output.write(name + "\t" + seq + "\n")

    def fn(self):
        """FASTA alignment -> interleaved Nexus."""
        _write_nexus(self.output, self._aligned_fasta())

    def pn(self):
        """Phylip -> interleaved Nexus.

        Whitespace inside taxon names is replaced by underscores because
        unquoted Nexus names cannot contain blanks.
        """
        records = [
            (re.sub(r"\s+", "_", name), seq)
            for name, seq in _read_phylip(self.input)
        ]
        _write_nexus(self.output, records)

    def pf(self):
        """Phylip -> FASTA."""
        for name, seq in _read_phylip(self.input):
            _write_fasta_record(self.output, name, seq)

    def nf(self):
        """Nexus -> FASTA."""
        for name, seq in _read_nexus_matrix(self.input):
            _write_fasta_record(self.output, name, seq)

    def np(self):
        """Nexus -> sequential Phylip (``name<TAB>sequence``)."""
        records = _read_nexus_matrix(self.input)
        self.output.write("%d\t%d\n" % (len(records), len(records[0][1])))
        for name, seq in records:
            self.output.write(_fixed_width_name(name) + "\t" + seq + "\n")

    def fg(self):
        """FASTA -> minimal GenBank (LOCUS, ORIGIN and ``//`` only)."""
        for name, seq in _read_fasta(self.input):
            self.output.write(
                "LOCUS\t%s\t%s bp\nORIGIN\n" % (name, len(seq))
            )
            for start in range(0, len(seq), _GENBANK_LINE):
                piece = seq[start:start + _GENBANK_LINE]
                self.output.write(
                    "%d\t%s\n" % (start + 1, _grouped(piece, _GENBANK_GROUP))
                )
            self.output.write("//\n\n")

    def gf(self):
        """GenBank -> FASTA.

        The header is the LOCUS name, followed by a tab and the DEFINITION
        text when there is one.  A record is written when a line starting
        with ``//`` is reached.
        """
        name = None
        definition = []
        sequence = []
        in_origin = False
        for line in self.input:
            tokens = line.split()
            if "LOCUS" in line:
                name = tokens[1] if len(tokens) > 1 else ""
                definition = []
                sequence = []
                in_origin = False
            if "DEFINITION" in line:
                definition.extend(tokens[1:])
            if "ORIGIN" in line:
                in_origin = True
            elif in_origin:
                # The first token of a sequence line is its position.
                sequence.extend(tokens[1:])
            # Only a line that *starts* with "//" ends a record; "//" can
            # also occur inside URLs in annotation fields.
            if line.lstrip().startswith("//"):
                if name is not None:
                    header = name
                    if definition:
                        header += "\t" + " ".join(definition)
                    _write_fasta_record(
                        self.output, header, "".join(sequence)
                    )
                    name = None
                in_origin = False


class check_fileformat(object):
    """Sanity checks run on the input before converting.

    ``single()`` and ``multi()`` return ``"ok"`` when the input looks like
    the declared source format, call ``sys.exit`` with a message when it
    does not, and return ``None`` for a source format they do not handle.
    """

    def __init__(self, inouttype, input):
        self.intype = inouttype[:1]  # source-format letter of e.g. "f-p"
        self.infile = input          # list of input lines

    def _count_fasta_headers(self):
        """Return the number of lines starting with ``>``."""
        return sum(1 for line in self.infile if line.startswith(">"))

    def _check_genbank(self):
        """Require LOCUS, ORIGIN, at least one sequence line after ORIGIN
        and a terminating ``//`` line."""
        locus = origin = sequence = end = False
        for line in self.infile:
            tokens = line.split()
            if "LOCUS" in line:
                locus = True
            if "ORIGIN" in line:
                origin = True
            elif origin and (
                len(tokens) >= 7
                or (len(tokens) >= 2 and tokens[0].isdigit())
            ):
                # A full 60-residue line has 7 tokens; shorter sequences
                # are accepted as "<position> <residues...>".
                sequence = True
            if line.lstrip().startswith("//"):
                end = True
        if not (locus and origin and sequence and end):
            sys.exit(_GENBANK_ERROR)
        return "ok"

    def single(self):
        """Validate a single-sequence FASTA (``f``) or GenBank (``g``)."""
        if self.intype == "f":
            count = self._count_fasta_headers()
            if count > 1:
                sys.exit(
                    "The input file is a multi-fasta file. Please resubmit "
                    "the job using the 'multi sequence' option"
                )
            if count == 0 or len(self.infile) < 2:
                sys.exit(_FASTA_ERROR)
            return "ok"
        if self.intype == "g":
            return self._check_genbank()
        return None

    def _check_phylip(self):
        """Require a ``ntax nchar [options]`` header with ntax > 1."""
        fields = self.infile[0].split() if self.infile else []
        if len(fields) not in (2, 3):
            sys.exit("the input file is not in phylip format.")
        try:
            ntax = int(fields[0])
        except ValueError:
            sys.exit("the input file is not in phylip format.")
        if ntax <= 1:
            sys.exit("There is only one sequence in the file")
        return "ok"

    def _check_nexus(self):
        """Require ``#NEXUS``, ``BEGIN DATA;`` and ``MATRIX``; reject a
        declared ``NTAX`` of one or less."""
        if not self.infile or "#NEXUS" not in self.infile[0]:
            sys.exit("the input file is not in nexus format.")
        begin = matrix = False
        ntax = None
        for line in self.infile:
            lowered = line.lower()
            if "begin data;" in lowered:
                begin = True
            if "matrix" in lowered:
                matrix = True
            found = re.search(r"ntax\s*=\s*(\d+)", lowered)
            if found:
                ntax = int(found.group(1))
        if not (begin and matrix):
            sys.exit("the input file is not in nexus format.")
        # Files that do not declare NTAX are accepted, as before.
        if ntax is not None and ntax <= 1:
            sys.exit("There is only one sequence in the file")
        return "ok"

    def multi(self):
        """Validate a multi-sequence Phylip, Nexus, FASTA or GenBank."""
        if self.intype == "p":
            return self._check_phylip()
        if self.intype == "n":
            return self._check_nexus()
        if self.intype == "f":
            count = self._count_fasta_headers()
            if count == 1:
                sys.exit(
                    "The input file is a single-fasta file. Please resubmit "
                    "the job using the 'single sequence' option"
                )
            if count == 0 or len(self.infile) < 4:
                sys.exit(_FASTA_ERROR)
            return "ok"
        if self.intype == "g":
            return self._check_genbank()
        return None


def main(input, output, inouttype, type):
    """Validate *input* and write its conversion to *output*.

    input     -- list of lines of the source file
    output    -- writable text file object; always closed on return,
                 including when validation exits
    inouttype -- conversion code such as ``"f-p"``
    type      -- ``"single"`` or ``"multi"``

    Unknown modes or conversion codes produce no output.
    """
    try:
        check = check_fileformat(inouttype, input)
        if type == "single":
            conversions = _SINGLE_CONVERSIONS
            verdict = check.single()
        elif type == "multi":
            conversions = _MULTI_CONVERSIONS
            verdict = check.multi()
        else:
            return
        if verdict == "ok" and inouttype in conversions:
            conv = convertitori(input, inouttype, type, output)
            getattr(conv, conversions[inouttype])()
    finally:
        output.close()


if __name__ == "__main__":
    with open(sys.argv[1], "r") as source:
        source_lines = source.readlines()
    # Append mode is kept: the caller may hand over a pre-created file.
    target = open(sys.argv[2], "a")
    main(source_lines, target, sys.argv[3], sys.argv[4])