Mercurial > repos > bornea > saint_preproc
changeset 4:378db8ea92ea draft
Deleted selected files
author | bornea |
---|---|
date | Tue, 10 Nov 2015 13:13:51 -0500 |
parents | 6571324e3d2c |
children | 2b631809150b |
files | saint_preproc/SAINT_preprocessing_v5.xml saint_preproc/SAINT_preprocessing_v6.py saint_preproc/SAINT_preprocessing_v6_mq_pep.py saint_preproc/pre_process_protein_name_set.R saint_preproc/tool_dependencies.xml |
diffstat | 5 files changed, 0 insertions(+), 681 deletions(-) [+] |
line wrap: on
line diff
--- a/saint_preproc/SAINT_preprocessing_v5.xml Tue Nov 10 13:13:22 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,91 +0,0 @@ -<tool id="SAINT_preprocessing_v5" name="SAINT pre-processing"> - <description></description> - <command interpreter="python"> - #if (str($type) == 'Scaffold'): - SAINT_preprocessing_v6.py $input $preybool $fasta_db $Inter_file $Prey_file - " - #for $ba in $bait - ${ba.bait1} - ${ba.assign} - ${ba.T_C} - #end for - " - $Bait_file \$INSTALL_RUN_PATH/ - #elif (str($type) == 'MaxQuant'): - SAINT_preprocessing_v6_mq_pep.py $input $preybool $fasta_db $Inter_file $Prey_file - " - #for $ba in $bait - ${ba.bait1} - ${ba.assign} - ${ba.T_C} - #end for - " - $Bait_file \$INSTALL_RUN_PATH/ - #end if - </command> - <requirements> - <requirement type="set_environment">INSTALL_RUN_PATH</requirment> - </requirments> - <inputs> - <param type="select" name="type" label="MaxQuant or Scaffold"> - <option value="MaxQuant">MaxQuant</option> - <option value="Scaffold">Scaffold</option> - </param> - <param format="dat" name="input" type="data" label="Scaffold or MaxQuant proteinGroup Output"/> - <param type="boolean" name="preybool" checked="true" label="Create Prey File"/> - <param type="data" name="fasta_db" format="fasta" label="Provide Uniprot Fasta database" /> - <repeat name="bait" title="Bait Create"> - <param name="bait1" type="text" size="100"/> - <param name="assign" type="text" size="100"/> - <param name="T_C" type="boolean" checked="true" label="Is this a Control?"/> - </repeat> - </inputs> - <outputs> - <data format="txt" name="Inter_file" label="Inter File"/> - <data format="txt" name="Prey_file" label="Prey File" /> - <data format="txt" name="Bait_file" label="Bait File" /> - </outputs> - <stdio> - <regex match="error" - source="stdout" - level="fatal" - description="Unknown error"/> - <regex match="Error: bad bait" - source="stdout" - level="fatal" - description="Error: bad bait"/> - </stdio> - - <tests> - <test> - <param name="input" value="fa_gc_content_input.fa"/> - <output name="out_file1" file="fa_gc_content_output.txt"/> - </test> - </tests> - <help> -Pre-processing: -APOSTL is able to recognize either a Scaffold "Samples Report" file (tab-delimited -txt file) or the "peptides.txt" file output in the maxquant "txt" output folder. No -modifications should be made to these files. Using the "Bait Create" tool, you can -create your "bait.txt" file. It is important that the individual bait names match the -bait names within your scaffold or maxquant output. APOSTL uses the bait file to find -the user's baits of interest. Additionally there is an option to make the prey file (Y/N). -When making a prey file, APOSTL queries uniprot in order to extract protein amino acid -lengths and gene names. This takes several minutes depending on your internet connection. -Some users may want to run SAINTexpress using the same data set while changing which baits -are considered test or control. It is useful to toggle "Make Prey" off in order to save -time by circumventing this step as the same prey file can be used for both SAINTexpress -runs. - -INPUTS: - -Scaffold file: - -- Scaffold "Samples Report" output (tab-delimited txt file) - - -Maxquant file: - -- maxquant "peptides.txt" file (tab-delimited txt file) - </help> -</tool>
--- a/saint_preproc/SAINT_preprocessing_v6.py Tue Nov 10 13:13:22 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,245 +0,0 @@ -####################################################################################### -# Python-code: SAINT pre-processing from Scaffold "Samples Report" output -# Author: Brent Kuenzi -####################################################################################### -# This program reads in a raw Scaffold "Samples Report" output and a user generated -# bait file and autoformats it into prey and interaction files for SAINTexpress -# analysis -####################################################################################### -import sys -import urllib2 -import os.path -####################################################################################### -## REQUIRED INPUT ## - -# 1) infile: Scaffold "Samples Report" output -# 2) baitfile: SAINT formatted bait file generated in Galaxy -# 3) prey: Y or N for generating a prey file (requires internet connection) -####################################################################################### -infile = sys.argv[1] #Scaffold "Samples Report" output -prey = sys.argv[2] # Y or N -fasta_db = sys.argv[3] -tool_path = sys.argv[8] -if fasta_db == "None": - fasta_db = str(tool_path) + "/SwissProt_HUMAN_2014_08.fasta" -make_bait= sys.argv[5] - - -baits = make_bait.split() -i = 0 -bait_file_tmp = open("bait.txt", "wr") -order = [] -bait_cache = [] - -while i < len(baits): - if baits[i+2] == "true": - T_C = "C" - else: - T_C = "T" - line1 = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n" - q = open(infile,"r") - for line2 in q: - line2 = line2.strip() - temp = line2.split('\t') - if "Quantitative Variance" in str(temp): - if baits[i] in temp: - number_bait = temp.index(str(baits[i])) - number_bait = number_bait - 9 - bait_cache.append((number_bait, str(line1))) - else: - print "Error: bad bait " + str(baits[i]) - sys.exit() - else: - pass - i = i + 3 - -bait_cache.sort() -for line in bait_cache: - bait_file_tmp.write(line[1]) - -bait_file_tmp.close() -baitfile = "bait.txt" - -class ReturnValue1(object): - def __init__(self, sequence, gene): - self.seqlength = sequence - self.genename = gene -class ReturnValue2(object): - def __init__(self, getdata, getproteins, getheader): - self.data = getdata - self.proteins = getproteins - self.header = getheader - -def main(scaffold_input, baitfile): - bait_check(baitfile, scaffold_input) - make_inter(scaffold_input) - if prey == 'true': - make_prey(scaffold_input) - no_error_inter(scaffold_input) - os.rename('prey.txt', sys.argv[5]) - elif prey == 'false': - if os.path.isfile('error proteins.txt') == True: - no_error_inter(scaffold_input) - pass - elif prey != 'true' or 'false': - sys.exit("Invalid Prey Argument: Y or N") - -def get_info(uniprot_accession_in): # get aa lengths and gene name - error = open('error proteins.txt', 'a+') -# while True: -# i = 0 -# try: -# data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta") -# break -# except urllib2.HTTPError, err: -# i = i + 1 -# if i == 50: -# sys.exit("More than 50 errors. Check your file or try again later.") -# if err.code == 404: -# error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n') -# seqlength = 'NA' -# genename = 'NA' -# return ReturnValue1(seqlength, genename) -# elif err.code == 302: -# sys.exit("Request timed out. Check connection and try again.") -# else: -# sys.exit("Uniprot had some other error") -# lines = data.readlines() -# if lines == []: -# error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n') -# error.close -# seqlength = 'NA' -# genename = 'NA' -# return ReturnValue1(seqlength, genename) -# if lines != []: -# seqlength = 0 -# header = lines[0] -# for line in lines[1:]: -# line = line.replace("\n","") # strip \n or else it gets counted in the length -# seqlength += len(line) -# if 'GN=' in header: -# lst = header.split('GN=') -# lst2 = lst[1].split(' ') -# genename = lst2[0] -# error.close -# return ReturnValue1(seqlength, genename) -# if 'GN=' not in header: -# genename = 'NA' -# error.close -# return ReturnValue1(seqlength, genename) - data = open(fasta_db,'r') - lines = data.readlines() - db_len = len(lines) - seqlength = 0 - count = 0 - for i in lines: - if ">sp" in i: - if uniprot_accession_in == i.split("|")[1]: - match = count+1 - if 'GN=' in i: - lst = i.split('GN=') - lst2 = lst[1].split(' ') - genename = lst2[0] - if 'GN=' not in i: - genename = 'NA' - while ">sp" not in lines[match]: - if match <= db_len: - seqlength = seqlength + len(lines[match].strip()) - match = match + 1 - else: - break - return ReturnValue1(seqlength, genename) - count = count + 1 - - - if seqlength == 0: - error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n') - error.close - seqlength = 'NA' - genename = 'NA' - return ReturnValue1(seqlength, genename) - -def readtab(infile): - with open(infile,'r') as x: # read in tab-delim text - output = [] - for line in x: - line = line.strip() - temp = line.split('\t') - output.append(temp) - return output -def read_scaffold(scaffold_input): # Get data, proteins and header from scaffold output - dupes = readtab(scaffold_input) - cnt = 0 - for i in dupes: - cnt += 1 - if i[0] == '#': # finds the start of second header - header_start = cnt-1 - header = dupes[header_start] - prot_start = header.index("Accession Number") - data = dupes[header_start+1:len(dupes)-2] # cut off blank line and END OF FILE - proteins = [] - for i in data: - i[4] = i[4].split()[0] # removes the (+##) that sometimes is attached - for protein in data: - proteins.append(protein[prot_start]) - return ReturnValue2(data, proteins, header) -def make_inter(scaffold_input): - bait = readtab(baitfile) - data = read_scaffold(scaffold_input).data - header = read_scaffold(scaffold_input).header - proteins = read_scaffold(scaffold_input).proteins - bait_index = [] - for i in bait: - bait_index.append(header.index(i[0])) # Find just the baits defined in bait file - with open('inter.txt', 'w') as y: - a = 0; l=0 - for bb in bait: - for lst in data: - y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n') - a+=1 - if a == len(proteins): - a = 0; l+=1 -def make_prey(scaffold_input): - proteins = read_scaffold(scaffold_input).proteins - output_file = open("prey.txt",'w') - for a in proteins: - a = a.replace("\n","") # remove \n for input into function - a = a.replace("\r","") # ditto for \r - seq = get_info(a).seqlength - GN = get_info(a).genename - if seq != 'NA': - output_file.write(a+"\t"+str(seq)+ "\t" + str(GN) + "\n") - output_file.close() -def no_error_inter(scaffold_input): # remake inter file without protein errors from Uniprot - err = readtab("error proteins.txt") - bait = readtab(baitfile) - data = read_scaffold(scaffold_input).data - header = read_scaffold(scaffold_input).header - bait_index = [] - for i in bait: - bait_index.append(header.index(i[0])) - proteins = read_scaffold(scaffold_input).proteins - errors = [] - for e in err: - errors.append(e[0]) - with open('inter.txt', 'w') as y: - l = 0; a = 0 - for bb in bait: - for lst in data: - if proteins[a] not in errors: - y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n') - a+=1 - if a == len(proteins): - l += 1; a = 0 -def bait_check(bait, scaffold_input): # check that bait names share header titles - bait_in = readtab(bait) - header = read_scaffold(scaffold_input).header - for i in bait_in: - if i[0] not in header: - sys.exit("Bait must share header titles with Scaffold output") - -if __name__ == '__main__': - main(infile, baitfile) - -os.rename('inter.txt', sys.argv[4]) -os.rename("bait.txt", sys.argv[7])
--- a/saint_preproc/SAINT_preprocessing_v6_mq_pep.py Tue Nov 10 13:13:22 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,262 +0,0 @@ -####################################################################################### -# Python-code: SAINT pre-processing from maxquant "Samples Report" output -# Author: Brent Kuenzi -####################################################################################### -# This program reads in a raw maxquant "Samples Report" output and a user generated -# bait file and autoformats it into prey and interaction files for SAINTexpress -# analysis -####################################################################################### -import sys -import urllib2 -import os -####################################################################################### -## REQUIRED INPUT ## - -# 1) infile: maxquant "Samples Report" output -# 2) baitfile: SAINT formatted bait file generated in Galaxy -# 3) prey: Y or N for generating a prey file (requires internet connection) -####################################################################################### -mq_file = sys.argv[1] -cmd = r"Rscript /home/bornea/galaxy_moffitt_dev/tools/Moffitt_Tools/bubblebeam/pre_process_protein_name_set.R " + str(mq_file) -os.system(cmd) - -infile = "./tukeys_output.txt" #maxquant "Samples Report" output -prey = sys.argv[2] # Y or N -fasta_db = sys.argv[3] -if fasta_db == "None": - fasta_db = "/home/bornea/galaxy_moffitt_dev/tools/Moffitt_Tools/bubblebeam/SwissProt_HUMAN_2014_08.fasta" -make_bait= sys.argv[6] - -def bait_create(baits, infile): - #Takes the Bait specified by the user and makes them into a Bait file and includes a check to make sure they are using valid baits. - baits = make_bait.split() - i = 0 - bait_file_tmp = open("bait.txt", "wr") - order = [] - bait_cache = [] - while i < len(baits): - if baits[i+2] == "true": - T_C = "C" - else: - T_C = "T" - line1 = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n" - q = open(infile,"r") - for line2 in q: - line2 = line2.replace("\"", "") - line2 = line2.replace(r"Intensity.", "") #R coerces "-" into "." this changes them back and remove Intensity from the Bait names. - line2 = line2.replace(r".", r"-") - temp = line2.split() - if "mapped_protein" in str(temp): - #If the bait is in the original file then write to cache it if not exit. - if baits[i] in temp: - number_bait = temp.index(str(baits[i])) - number_bait = number_bait - 9 - bait_cache.append((number_bait, str(line1))) - else: - print "Error: bad bait " + str(baits[i]) - sys.exit() - else: - pass - i = i + 3 - #Writes cache to file. - bait_cache.sort() - for line in bait_cache: - bait_file_tmp.write(line[1]) - - bait_file_tmp.close() - - -baitfile = "bait.txt" - -class ReturnValue1(object): - def __init__(self, sequence, gene): - self.seqlength = sequence - self.genename = gene -class ReturnValue2(object): - def __init__(self, getdata, getproteins, getheader): - self.data = getdata - self.proteins = getproteins - self.header = getheader - -def main(maxquant_input, make_bait): - bait_create(make_bait, infile) - baitfile = "bait.txt" - #bait_check(baitfile, maxquant_input) - make_inter(maxquant_input) - if prey == 'true': - make_prey(maxquant_input) - no_error_inter(maxquant_input) - os.rename('prey.txt', sys.argv[5]) - elif prey == 'false': - if os.path.isfile('error proteins.txt') == True: - no_error_inter(maxquant_input) - pass - elif prey != 'true' or 'false': - sys.exit("Invalid Prey Argument: Y or N") - os.rename('inter.txt', sys.argv[4]) - os.rename("bait.txt", sys.argv[7]) - - -def get_info(uniprot_accession_in): # get aa lengths and gene name - error = open('error proteins.txt', 'a+') -# while True: -# i = 0 -# try: -# data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta") -# break -# except urllib2.HTTPError, err: -# i = i + 1 -# if i == 50: -# sys.exit("More than 50 errors. Check your file or try again later.") -# if err.code == 404: -# error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n') -# seqlength = 'NA' -# genename = 'NA' -# return ReturnValue1(seqlength, genename) -# elif err.code == 302: -# sys.exit("Request timed out. Check connection and try again.") -# else: -# sys.exit("Uniprot had some other error") -# -# -# if lines == []: -# error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n') -# error.close -# if lines == []: -# error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n') -# error.close -# seqlength = 'NA' -# genename = 'NA' -# return ReturnValue1(seqlength, genename) -# if lines != []: -# seqlength = 0 -# header = lines[0] -# for line in lines[1:]: -# line = line.replace("\n","") # strip \n or else it gets counted in the length -# seqlength += len(line) -# if 'GN=' in header: -# lst = header.split('GN=') -# lst2 = lst[1].split(' ') -# genename = lst2[0] -# error.close -# return ReturnValue1(seqlength, genename) -# if 'GN=' not in header: -# genename = 'NA' -# error.close -# return ReturnValue1(seqlength, genename) - - data = open(fasta_db,'r') - lines = data.readlines() - db_len = len(lines) - seqlength = 0 - count = 0 - for i in lines: - if ">sp" in i: - if uniprot_accession_in == i.split("|")[1]: - match = count+1 - if 'GN=' in i: - lst = i.split('GN=') - lst2 = lst[1].split(' ') - genename = lst2[0] - if 'GN=' not in i: - genename = 'NA' - while ">sp" not in lines[match]: - if match <= db_len: - seqlength = seqlength + len(lines[match].strip()) - match = match + 1 - else: - break - return ReturnValue1(seqlength, genename) - count = count + 1 - - - if seqlength == 0: - error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n') - error.close - seqlength = 'NA' - genename = 'NA' - return ReturnValue1(seqlength, genename) - - -def readtab(infile): - with open(infile,'r') as x: # read in tab-delim text - output = [] - for line in x: - line = line.strip() - temp = line.split('\t') - output.append(temp) - return output -def read_maxquant(maxquant_input): # Get data, proteins and header from maxquant output - dupes = readtab(maxquant_input) - header_start = 0 - header = dupes[header_start] - for i in header: - i = i.replace(r"\"", "") - i = i.replace(r"Intensity.", r"") - i = i.replace(r".", r"-") - data = dupes[header_start+1:len(dupes)] #cut off blank line and END OF FILE - proteins = [] - for protein in data: - proteins.append(protein[0]) - return ReturnValue2(data, proteins, header) -def make_inter(maxquant_input): - bait = readtab(baitfile) - data = read_maxquant(maxquant_input).data - header = read_maxquant(maxquant_input).header - proteins = read_maxquant(maxquant_input).proteins - bait_index = [] - for i in bait: - bait_index.append(header.index("mapped_protein") + 1) # Find just the baits defined in bait file - with open('inter.txt', 'w') as y: - a = 0; l=0 - for bb in bait: - for lst in data: - y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n') - a+=1 - if a == len(proteins): - a = 0; l+=1 -def make_prey(maxquant_input): - proteins = read_maxquant(maxquant_input).proteins - output_file = open("prey.txt",'w') - for a in proteins: - a = a.replace("\n","") # remove \n for input into function - a = a.replace("\r","") # ditto for \r - seq = get_info(a).seqlength - GN = get_info(a).genename - if seq != 'NA': - output_file.write(a+"\t"+str(seq)+ "\t" + str(GN) + "\n") - output_file.close() -def no_error_inter(maxquant_input): # remake inter file without protein errors from Uniprot - err = readtab("error proteins.txt") - bait = readtab(baitfile) - data = read_maxquant(maxquant_input).data - header = read_maxquant(maxquant_input).header - header = [i.replace(r"\"", "") for i in header] - header = [i.replace(r"Intensity.", r"") for i in header] - header = [i.replace(r".", r"-") for i in header] - print header - bait_index = [] - for i in bait: - bait_index.append(header.index(i[0])) - proteins = read_maxquant(maxquant_input).proteins - errors = [] - for e in err: - errors.append(e[0]) - with open('inter.txt', 'w') as y: - l = 0; a = 0 - for bb in bait: - for lst in data: - if proteins[a] not in errors: - y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n') - a+=1 - if a == len(proteins): - l += 1; a = 0 -def bait_check(bait, maxquant_input): # check that bait names share header titles - bait_in = readtab(bait) - header = read_maxquant(maxquant_input).header - for i in bait_in: - if i[0] not in header: - sys.exit("Bait must share header titles with maxquant output") - -if __name__ == '__main__': - main(infile, make_bait)
--- a/saint_preproc/pre_process_protein_name_set.R Tue Nov 10 13:13:22 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ -library(data.table) -library(affy) -library(stringr) -library(mygene) -library(VennDiagram) -##### -#data -main <- function(peptides_file) { - peptides_file = read.delim(peptides_file,header=TRUE,stringsAsFactors=FALSE,fill=TRUE) - peptides_txt = peptides_file - intensity_columns = names(peptides_txt[,str_detect(names(peptides_txt),"Intensity\\.*")]) #Pulls out all lines with Intensity in them. - intensity_columns = intensity_columns[2:length(intensity_columns)] #Removes the first column that does not have a bait. - peptides_txt_mapped = as.data.frame(map_peptides_proteins(peptides_txt)) #This function as below sets every line to a 1 to 1 intensity to each possible protein. - peptides_txt_mapped$Uniprot = str_extract(peptides_txt_mapped$mapped_protein, "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") #Pulls out just Uniprot id from the script. - peptides_txt_mapped = subset(peptides_txt_mapped,!is.na(Uniprot)) #removes reverse sequences and any that didn't match a uniprot accession - columns_comb = c("Uniprot", intensity_columns) - peptides_mapped_intensity = subset(peptides_txt_mapped, select = columns_comb) #Subsets out only the needed cloumns for Tukeys (Uniprot IDS and baited intensities) - swissprot_fasta = scan("/home/philip/galaxy/tools/Moffitt_Tools/uniprot_names.txt",what="character") - peptides_txt_mapped_log2 = peptides_mapped_intensity - # Takes the log2 of the intensities. - for (i in intensity_columns) { - peptides_txt_mapped_log2[,i] = log2(subset(peptides_txt_mapped_log2, select = i)) - } - #get the minimum from each column while ignoring the -Inf; get the min of these mins for the global min; breaks when there's only one intensity column - global_min = min(apply(peptides_txt_mapped_log2[,2:ncol(peptides_txt_mapped_log2)],2,function(x) { - min(x[x != -Inf]) - })) - peptides_txt_mapped_log2[peptides_txt_mapped_log2 == -Inf] <- 0 - #uniprot accessions WITHOUT isoforms; it looks like only contaminants contain isoforms anyways - mapped_protein_uniprotonly = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") - mapped_protein_uniprot_accession = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9](-[0-9]+)?|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(-[0-9]+)?|[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") - peptides_txt_mapped_log2$mapped_protein = mapped_protein_uniprotonly - # Runs the Tukey function returning completed table - peptides_txt_mapped_log2 = subset(peptides_txt_mapped_log2,mapped_protein %in% swissprot_fasta) - protein_intensities_tukeys = get_protein_values(peptides_txt_mapped_log2,intensity_columns) - protein_intensities_tukeys[protein_intensities_tukeys == 1] <- 0 - write.table(protein_intensities_tukeys, "./tukeys_output.txt", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t") - -} - -map_peptides_proteins = function(peptides_in) { - #reverse sequences are blank but have a razor protein indicating that they are reverse; exclude these for now - peptides_in = subset(peptides_in,peptides_in$Proteins != "") - results_list = list() - k = 1 - for (i in 1:nrow(peptides_in)) { - protein_names = peptides_in[i,"Proteins"] - protein_names_split = unlist(strsplit(protein_names,";")) - for (j in 1:length(protein_names_split)) { - peptides_mapped_proteins = data.frame(peptides_in[i,],mapped_protein=protein_names_split[j],stringsAsFactors=FALSE) - results_list[[k]] = peptides_mapped_proteins - k = k+1 - - } - } - return(rbindlist(results_list)) -} - -get_protein_values = function(mapped_peptides_in,intensity_columns_list) { - unique_mapped_proteins_list = unique(mapped_peptides_in$mapped_protein) # Gets list of all peptides listed. - # Generates a blank data frame with clomns of Intensities and rows of Uniprots. - Tukeys_df = data.frame(mapped_protein = unique_mapped_proteins_list, stringsAsFactors = FALSE ) - for (q in intensity_columns_list) {Tukeys_df[,q] = NA} - for (i in 1:length(unique_mapped_proteins_list)) { - mapped_peptides_unique_subset = subset(mapped_peptides_in, mapped_protein == unique_mapped_proteins_list[i]) - #calculate Tukey's Biweight from library(affy); returns a single numeric - #results_list[[i]] = data.frame(Protein=unique_mapped_proteins_list[i],Peptides_per_protein=nrow(mapped_peptides_unique_subset)) - for (j in intensity_columns_list) { - #Populates with new Tukeys values. - Tukeys_df[i,j] = 2^(tukey.biweight(mapped_peptides_unique_subset[,j])) - } - } - return(Tukeys_df) -} - -args <- commandArgs(trailingOnly = TRUE) -main(args[1])
--- a/saint_preproc/tool_dependencies.xml Tue Nov 10 13:13:22 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <set_environment version="1.0"> - <environment_variable name="INSTALL_RUN_PATH" action="set_to">$REPOSITORY_INSTALL_DIR</environment_variable> - </set_environment>--> -</tool_dependency> \ No newline at end of file