Mercurial > repos > bornea > merge_scaffold
view MergeFiles.py @ 4:c1b96e4abf8d draft
Uploaded
author | bornea |
---|---|
date | Wed, 03 Aug 2016 14:55:07 -0400 |
parents | c1199500c601 |
children | 008bc143e72e |
line wrap: on
line source
""" Python-code: Merge Scaffold Samples Report files @author = Brent Kuenzi @email = Brent.Kuenzi@moffitt.org """ ####################################################################################### import sys import urllib2 import os.path import pandas as pd ####################################################################################### ## Description: ## # This program will merge either 2 or 3 scaffold # sample report files together ## Required input: ## infile1 = sys.argv[1] # scaffold report #1 -- filename infile2 = sys.argv[2] # scaffold report #2 -- filename infile3 = sys.argv[3] # scaffold report #3 -- filename or "False" baitfile = sys.argv[4] # Bait file -- filename outfile = sys.argv[5] # output filename class ScaffoldReturn(object): def __init__(self, getdata, getproteins, getheader): self.data = getdata self.proteins = getproteins self.header = getheader class ProteinInfo(object): def __init__(self, getMW, getDescr): self.mw = getMW self.descr = getDescr def readtab(infile): with open(infile,'r') as x: # read in tab-delim text output = [] for line in x: line = line.strip() temp = line.split('\t') output.append(temp) return output def read_scaffold(scaffold_input): # Get data, proteins and header from scaffold output dupes = readtab(scaffold_input) cnt = 0 for i in dupes: if "Accession Number" in i: # finds the start of header header_start = cnt break cnt += 1 header = dupes[header_start] prot_start = header.index("Accession Number") data = dupes[header_start+1:len(dupes)-2] # cut off blank line and END OF FILE proteins = [] for i in data: i[prot_start] = i[prot_start].split()[0] # removes the (+##) that sometimes is attached for protein in data: proteins.append(protein[prot_start]) return ScaffoldReturn(data, proteins, header) def MakeDF(scaffold_input,bait_input): bait = readtab(bait_input) data = read_scaffold(scaffold_input).data header = read_scaffold(scaffold_input).header proteins = read_scaffold(scaffold_input).proteins prot_start = header.index("Accession Number") bait_index = [] ind = [] for i in bait: if i[0] in header: bait_index.append(header.index(i[0])) # Find just the baits defined in bait file ind.append(i[0]) frames = {} for i in proteins: protein = i.split()[0] if protein not in frames: frames[protein] = [] # create dictionary of proteins for each bait value for i in data: temp = i[prot_start] protein = temp.split()[0] for j in bait_index: # create dataframe frames[protein].append(i[j]) df = pd.DataFrame(frames,index=ind) return df def get_info(input1,input2,input3): files = [input1,input2] molwt = {} protdesc = {} if input3 != "False": files.append(input3) for i in files: data = read_scaffold(i).data header = read_scaffold(i).header prot_start = header.index("Accession Number") # find header prot = 0 mw = 0 cnt=0 for j in header: # find info if "Identified" in j: prot=cnt if "Molecular Weight" in j: mw=cnt cnt+=1 for k in data: # append info temp = k[prot_start] protein = temp.split()[0] if protein not in protdesc: protdesc[protein] = k[prot] molwt[protein] = k[mw] mw = pd.DataFrame(molwt,index=["Molecular Weight"]) descr = pd.DataFrame(protdesc,index=["Identified Proteins"]) return ProteinInfo(mw,descr) # return info as dataframe def MakeFile(input1,input2,input3,bait_input): files = [input1,input2] if input3 != "False": files.append(input3) DFs = [] DFs.append(get_info(input1,input2,input3).descr) DFs.append(get_info(input1,input2,input3).mw) for i in files: DFs.append(MakeDF(i,bait_input)) final_df = pd.concat(DFs) temp = final_df.T temp.index.name = "Accession Number" output = temp.fillna(0) output["Accession Number"] = output.index.get_values() output.index = range(1, len(output.index) + 1) output.index.name = "#" output.to_csv(outfile,sep="\t") ####################################################################################### MakeFile(input1 = infile1,input2= infile2,input3 = infile3,bait_input=baitfile, outfile="merged.txt") #######################################################################################