Mercurial > repos > izsam > phylogeny_converter
view converter.py @ 0:37392af48c37 draft default tip
Uploaded
author | izsam |
---|---|
date | Thu, 19 Mar 2015 11:46:50 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python
"""Convert sequence files between FASTA, Phylip, NEXUS and GenBank layouts.

Command line: ``converter.py INPUT OUTPUT INOUTTYPE TYPE``

* ``INPUT``     path of the file to read
* ``OUTPUT``    path of the file to write (opened in append mode)
* ``INOUTTYPE`` conversion code ``<from>-<to>`` built from the letters
  ``f`` (FASTA), ``p`` (Phylip), ``n`` (NEXUS) and ``g`` (GenBank)
* ``TYPE``      ``single`` or ``multi``

Every validation failure terminates the program through ``sys.exit`` with a
human-readable message.
"""
import sys
import string
import os
import re

_NAME_WIDTH = 10        # width of a Phylip taxon name
_FASTA_WIDTH = 80       # residues per FASTA line
_NEXUS_BLOCK = 100      # columns per interleaved NEXUS block
_NEXUS_GROUP = 20       # a blank follows every 20 residues in NEXUS rows
_GENBANK_WIDTH = 60     # residues per GenBank ORIGIN line
_GENBANK_GROUP = 10     # a blank follows every 10 residues in ORIGIN rows

_NEXUS_HEADER = (
    "#NEXUS\n\nBEGIN DATA;\nDIMENSIONS NTAX=%s NCHAR=%s;\n"
    "FORMAT DATATYPE=DNA INTERLEAVE MISSING=-;\n\nMATRIX\n"
)

_MSG_NOT_ALIGNED = (
    "The input file does not contains a multiple alignment in fasta format. "
    "Please ensure that all the sequences have the same length"
)
_MSG_BAD_PHYLIP = (
    "The input file is not in the proper format. Please check that your "
    "file is in Phylip standard interleaved (or sequential) format "
)
_MSG_BAD_NEXUS = "The input file is not in Nexus format. "
_MSG_BAD_FASTA = (
    "The input file is not in fasta format. Please check that the first row "
    "starts with > and that the sequence starts from the second line"
)
_MSG_BAD_GENBANK = (
    "The input file is not in GenBank format. Please make sure that the file "
    "contains at least the LOCUS and ORIGIN fields. The file must also ends "
    "with //"
)

_SINGLE_CONVERSIONS = ("f-g", "g-f")
_MULTI_CONVERSIONS = ("f-g", "g-f", "f-p", "f-n", "p-f", "p-n", "n-p", "n-f")


def _squeeze(text):
    """Return *text* with every whitespace character removed."""
    return "".join(text.split())


def _safe_name(header):
    """Return *header* stripped, with each inner whitespace char as ``_``."""
    return re.sub(r"\s", "_", header.strip())


def _phylip_name(name):
    """Return *name* cut or ``_``-padded to exactly ten characters."""
    return name[:_NAME_WIDTH].ljust(_NAME_WIDTH, "_")


def _group(seq, size):
    """Return *seq* with a blank appended after every complete *size* run."""
    pieces = []
    for start in range(0, len(seq), size):
        piece = seq[start:start + size]
        if len(piece) == size:
            piece += " "
        pieces.append(piece)
    return "".join(pieces)


def _wrap(seq, width):
    """Return *seq* as newline-terminated lines of at most *width* chars."""
    return "".join(
        seq[start:start + width] + "\n" for start in range(0, len(seq), width)
    )


def _write_fasta(output, name, seq):
    """Write one FASTA record (header plus 80-column sequence lines)."""
    output.write(">" + name + "\n" + _wrap(seq, _FASTA_WIDTH))


def _read_fasta(lines):
    """Return ``[(header, sequence)]`` for the FASTA records in *lines*.

    The header is the text after ``>`` without surrounding whitespace; the
    sequence is the concatenation of the stripped lines up to the next
    header.  Lines before the first header are ignored.
    """
    records = []
    for line in lines:
        if line.startswith(">"):
            records.append((line[1:].strip(), []))
        elif records:
            records[-1][1].append(line.strip())
    return [(header, "".join(parts)) for header, parts in records]


def _aligned_fasta(lines):
    """Return ``[(ten_char_name, sequence)]`` or exit if lengths differ."""
    records = [
        (_phylip_name(_safe_name(header)), seq)
        for header, seq in _read_fasta(lines)
    ]
    if not records:
        sys.exit(_MSG_BAD_FASTA)
    width = len(records[0][1])
    if any(len(seq) != width for _, seq in records):
        sys.exit(_MSG_NOT_ALIGNED)
    return records


def _write_nexus(output, records, ntax, nchar):
    """Write *records* as an interleaved NEXUS DATA block.

    Full 100-column blocks are each followed by three newlines; the
    remaining columns (if any) form a last block.  The matrix is closed
    with ``;`` and the block with ``END;``.
    """
    output.write(_NEXUS_HEADER % (ntax, nchar))
    full = (nchar // _NEXUS_BLOCK) * _NEXUS_BLOCK
    for start in range(0, full, _NEXUS_BLOCK):
        for name, seq in records:
            chunk = seq[start:start + _NEXUS_BLOCK]
            output.write(name + "\t" + _group(chunk, _NEXUS_GROUP) + "\n")
        output.write("\n\n\n")
    # Skip the last block altogether when no sequence has columns left.
    if any(seq[full:] for _, seq in records):
        for name, seq in records:
            output.write(name + "\t" + _group(seq[full:], _NEXUS_GROUP) + "\n")
    output.write(";\nEND;\n")


def _split_phylip_row(line):
    """Split a Phylip row holding both a name and sequence data.

    A tab within the first eleven characters is taken as the name/sequence
    separator; otherwise the name is the first ten columns.
    """
    tab = line.find("\t")
    if 0 <= tab <= _NAME_WIDTH:
        name, seq = line[:tab], line[tab + 1:]
    else:
        name, seq = line[:_NAME_WIDTH], line[_NAME_WIDTH:]
    return name.strip(), _squeeze(seq)


def _parse_phylip(lines):
    """Return ``[(name, sequence)]`` parsed from Phylip *lines*.

    Three layouts are recognised, looking only at non-blank lines after the
    header: one row per taxon; a name row followed by a sequence row; and
    interleaved blocks whose rows are assigned to taxa by position.

    Raises IndexError or ValueError when the header or body is malformed.
    """
    ntax = int(lines[0].split()[0])
    rows = [line for line in lines[1:] if line.strip()]
    if ntax < 1 or len(rows) < ntax:
        raise ValueError("fewer data rows than announced sequences")
    if len(rows) == ntax:
        return [_split_phylip_row(row) for row in rows]
    if len(rows[0].rstrip("\r\n")) <= _NAME_WIDTH:
        records = []
        for pos in range(0, len(rows), 2):
            seq = rows[pos + 1] if pos + 1 < len(rows) else ""
            records.append((rows[pos][:_NAME_WIDTH].strip(), _squeeze(seq)))
        return records
    names = []
    parts = []
    for row in rows[:ntax]:
        name, seq = _split_phylip_row(row)
        names.append(name)
        parts.append([seq])
    # Continuation rows carry no name: the i-th row of a block belongs to
    # the i-th taxon, whatever its text is.
    for offset, row in enumerate(rows[ntax:]):
        parts[offset % ntax].append(_squeeze(row))
    return [(name, "".join(chunks)) for name, chunks in zip(names, parts)]


def _load_phylip(lines):
    """Return the Phylip records of *lines* or exit with the format error."""
    try:
        return _parse_phylip(lines)
    except (IndexError, ValueError):
        sys.exit(_MSG_BAD_PHYLIP)


def _parse_nexus_matrix(lines):
    """Return ``[(name, sequence)]`` from the MATRIX section of *lines*.

    Rows are read after the first line containing ``matrix`` (any case) up
    to the first ``;`` or the end of input.  The first token of a row is the
    taxon name, the remaining tokens are appended to its sequence.  Taxa are
    returned in order of first appearance.
    """
    order = []
    chunks = {}
    in_matrix = False
    for line in lines:
        if not in_matrix:
            in_matrix = "matrix" in line.lower()
            continue
        data, terminator, _ = line.partition(";")
        tokens = data.split()
        if tokens:
            name = tokens[0]
            if name not in chunks:
                chunks[name] = []
                order.append(name)
            chunks[name].extend(tokens[1:])
        if terminator:
            break
    return [(name, "".join(chunks[name])) for name in order]


def _load_nexus(lines):
    """Return the NEXUS matrix records or exit when none are found."""
    records = _parse_nexus_matrix(lines)
    if not records:
        sys.exit(_MSG_BAD_NEXUS)
    return records


class convertitori:
    """Format converters; each method reads ``input`` and writes ``output``.

    ``input`` is the list of lines of the source file and ``output`` any
    object with a ``write`` method.  Method names are the conversion code
    without the dash (``fp`` converts FASTA to Phylip, and so on).
    """

    def __init__(self, input, inouttype, type, output):
        self.input = input
        self.inouttype = inouttype
        self.type = type
        self.output = output

    def fp(self):
        """FASTA alignment -> Phylip, one ``name<TAB>sequence`` row each."""
        records = _aligned_fasta(self.input)
        self.output.write("%s\t%s\n" % (len(records), len(records[0][1])))
        for name, seq in records:
            self.output.write(name + "\t" + seq + "\n")

    def fn(self):
        """FASTA alignment -> interleaved NEXUS."""
        records = _aligned_fasta(self.input)
        _write_nexus(self.output, records, len(records), len(records[0][1]))

    def pn(self):
        """Phylip -> interleaved NEXUS (NCHAR is taken from the header)."""
        parsed = _load_phylip(self.input)
        try:
            nchar = int(float(self.input[0].split()[1]))
        except (IndexError, ValueError):
            nchar = None
        if nchar is None:
            sys.exit(_MSG_BAD_PHYLIP)
        # NEXUS names are whitespace-delimited tokens, so inner blanks of a
        # Phylip name are replaced by underscores.
        records = [(_safe_name(name), seq) for name, seq in parsed]
        _write_nexus(self.output, records, len(records), nchar)

    def pf(self):
        """Phylip -> FASTA, each record written exactly once."""
        for name, seq in _load_phylip(self.input):
            _write_fasta(self.output, name, seq)

    def nf(self):
        """NEXUS -> FASTA, taxa in their order of first appearance."""
        for name, seq in _load_nexus(self.input):
            _write_fasta(self.output, name, seq)

    def np(self):
        """NEXUS -> Phylip, one ``name<TAB>sequence`` row per taxon."""
        records = _load_nexus(self.input)
        self.output.write("%s\t%s\n" % (len(records), len(records[0][1])))
        for name, seq in records:
            self.output.write(_phylip_name(name) + "\t" + seq + "\n")

    def fg(self):
        """FASTA -> minimal GenBank records (LOCUS, ORIGIN, ``//``)."""
        for header, seq in _read_fasta(self.input):
            self.output.write(
                "LOCUS\t%s\t%s bp\nORIGIN\n" % (_safe_name(header), len(seq))
            )
            for start in range(0, len(seq), _GENBANK_WIDTH):
                row = _group(seq[start:start + _GENBANK_WIDTH], _GENBANK_GROUP)
                self.output.write(str(start + 1) + "\t" + row + "\n")
            self.output.write("//\n\n")

    def gf(self):
        """GenBank -> FASTA.

        Keywords are recognised only as the first token of a line, so text
        such as a URL containing ``//`` cannot end a record early.  The
        FASTA header is the LOCUS name, followed by a tab and the
        DEFINITION words when a DEFINITION line is present.
        """
        name = None
        definition = []
        residues = []
        in_origin = False
        for line in self.input:
            tokens = line.split()
            if not tokens:
                continue
            keyword = tokens[0]
            if keyword == "LOCUS":
                name = tokens[1] if len(tokens) > 1 else ""
                definition = []
                residues = []
                in_origin = False
            elif keyword == "DEFINITION":
                definition.extend(tokens[1:])
            elif keyword == "ORIGIN":
                in_origin = True
            elif keyword == "//":
                if name is not None:
                    header = name
                    if definition:
                        header += "\t" + " ".join(definition)
                    _write_fasta(self.output, header, "".join(residues))
                name = None
                in_origin = False
            elif in_origin:
                # The first token of an ORIGIN row is the position counter.
                residues.extend(tokens[1:])


class check_fileformat:
    """Sanity checks run on the input before a conversion.

    ``single`` and ``multi`` return ``"ok"`` when the file looks right,
    ``None`` when the input format is not handled in that mode, and call
    ``sys.exit`` with a message otherwise.
    """

    def __init__(self, inouttype, input):
        self.intype = inouttype[0]
        self.infile = input

    def _fasta_headers(self):
        """Return the number of lines starting with ``>``."""
        return sum(1 for line in self.infile if line.startswith(">"))

    def _check_genbank(self):
        """Require LOCUS, ORIGIN, ``//`` and one full-looking sequence row."""
        locus = origin = end = full_row = False
        for line in self.infile:
            if "LOCUS" in line:
                locus = True
            if "ORIGIN" in line:
                origin = True
            elif origin and len(line.split()) >= 7:
                # NOTE(review): this demands at least one row with a counter
                # plus six groups, so very short sequences are rejected.
                full_row = True
            if "//" in line:
                end = True
        if not (locus and origin and end and full_row):
            sys.exit(_MSG_BAD_GENBANK)
        return "ok"

    def single(self):
        """Validate a single-sequence FASTA (``f``) or GenBank (``g``)."""
        if self.intype == "f":
            count = self._fasta_headers()
            if count == 0:
                sys.exit(_MSG_BAD_FASTA)
            if count > 1:
                sys.exit(
                    "The input file is a multi-fasta file. Please resubmit "
                    "the job using the 'multi sequence' option"
                )
            if len(self.infile) < 2:
                sys.exit(_MSG_BAD_FASTA)
            return "ok"
        if self.intype == "g":
            return self._check_genbank()
        return None

    def multi(self):
        """Validate a multi-sequence Phylip, NEXUS, FASTA or GenBank file."""
        if self.intype == "p":
            fields = self.infile[0].split() if self.infile else []
            try:
                ntax = int(fields[0]) if len(fields) in (2, 3) else None
            except ValueError:
                ntax = None
            if ntax is None:
                sys.exit("the input file is not in phylip format.")
            if ntax <= 1:
                sys.exit("There is only one sequence in the file")
            return "ok"
        if self.intype == "n":
            if not self.infile or "#NEXUS" not in self.infile[0]:
                sys.exit("the input file is not in nexus format.")
            begin = matrix = False
            ntax = None
            for line in self.infile:
                lowered = line.lower()
                if "begin data;" in lowered:
                    begin = True
                if "matrix" in lowered:
                    matrix = True
                found = re.search(r"ntax\s*=\s*(\d+)", lowered)
                if found:
                    ntax = int(found.group(1))
            if not (begin and matrix):
                sys.exit("the input file is not in nexus format.")
            # Only reject on the taxon count when the file declares one.
            if ntax is not None and ntax <= 1:
                sys.exit("There is only one sequence in the file")
            return "ok"
        if self.intype == "f":
            count = self._fasta_headers()
            if count == 0:
                sys.exit(_MSG_BAD_FASTA)
            if count == 1:
                sys.exit(
                    "The input file is a single-fasta file. Please resubmit "
                    "the job using the 'single sequence' option"
                )
            if len(self.infile) < 4:
                sys.exit(_MSG_BAD_FASTA)
            return "ok"
        if self.intype == "g":
            return self._check_genbank()
        return None


def main(input, output, inouttype, type):
    """Validate *input* and run the conversion named by *inouttype*.

    *input* is the list of source lines, *output* a writable file object,
    *type* is ``"single"`` or ``"multi"``.  Conversion codes not available
    in the requested mode write nothing.  *output* is always closed, also
    when validation exits.
    """
    try:
        check = check_fileformat(inouttype, input)
        if type == "single":
            verdict, allowed = check.single(), _SINGLE_CONVERSIONS
        elif type == "multi":
            verdict, allowed = check.multi(), _MULTI_CONVERSIONS
        else:
            verdict, allowed = None, ()
        if verdict == "ok" and inouttype in allowed:
            converter = convertitori(input, inouttype, type, output)
            # "f-p" is handled by method "fp", and so on.
            getattr(converter, inouttype.replace("-", ""))()
    finally:
        output.close()


if __name__ == "__main__":
    with open(sys.argv[1], "r") as source:
        source_lines = source.readlines()
    main(source_lines, open(sys.argv[2], "a"), sys.argv[3], sys.argv[4])