# HG changeset patch
# User bornea
# Date 1447179231 18000
# Node ID 378db8ea92eadfed98241b23ce228b5e37f6717b
# Parent 6571324e3d2caa63711132299b20bfe709bb552c
Deleted selected files
diff -r 6571324e3d2c -r 378db8ea92ea saint_preproc/SAINT_preprocessing_v5.xml
--- a/saint_preproc/SAINT_preprocessing_v5.xml Tue Nov 10 13:13:22 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,91 +0,0 @@
-
-
-
- #if (str($type) == 'Scaffold'):
- SAINT_preprocessing_v6.py $input $preybool $fasta_db $Inter_file $Prey_file
- "
- #for $ba in $bait
- ${ba.bait1}
- ${ba.assign}
- ${ba.T_C}
- #end for
- "
- $Bait_file \$INSTALL_RUN_PATH/
- #elif (str($type) == 'MaxQuant'):
- SAINT_preprocessing_v6_mq_pep.py $input $preybool $fasta_db $Inter_file $Prey_file
- "
- #for $ba in $bait
- ${ba.bait1}
- ${ba.assign}
- ${ba.T_C}
- #end for
- "
- $Bait_file \$INSTALL_RUN_PATH/
- #end if
-
-
- INSTALL_RUN_PATH
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Pre-processing:
-APOSTL is able to recognize either a Scaffold "Samples Report" file (tab-delimited
-txt file) or the "peptides.txt" file output in the maxquant "txt" output folder. No
-modifications should be made to these files. Using the "Bait Create" tool, you can
-create your "bait.txt" file. It is important that the individual bait names match the
-bait names within your scaffold or maxquant output. APOSTL uses the bait file to find
-the user's baits of interest. Additionally there is an option to make the prey file (Y/N).
-When making a prey file, APOSTL queries uniprot in order to extract protein amino acid
-lengths and gene names. This takes several minutes depending on your internet connection.
-Some users may want to run SAINTexpress using the same data set while changing which baits
-are considered test or control. It is useful to toggle "Make Prey" off in order to save
-time by circumventing this step as the same prey file can be used for both SAINTexpress
-runs.
-
-INPUTS:
-
-Scaffold file:
-
-- Scaffold "Samples Report" output (tab-delimited txt file)
-
-
-Maxquant file:
-
-- maxquant "peptides.txt" file (tab-delimited txt file)
-
-
diff -r 6571324e3d2c -r 378db8ea92ea saint_preproc/SAINT_preprocessing_v6.py
--- a/saint_preproc/SAINT_preprocessing_v6.py Tue Nov 10 13:13:22 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,245 +0,0 @@
-#######################################################################################
-# Python-code: SAINT pre-processing from Scaffold "Samples Report" output
-# Author: Brent Kuenzi
-#######################################################################################
-# This program reads in a raw Scaffold "Samples Report" output and a user generated
-# bait file and autoformats it into prey and interaction files for SAINTexpress
-# analysis
-#######################################################################################
-import sys
-import urllib2
-import os.path
-#######################################################################################
-## REQUIRED INPUT ##
-
-# 1) infile: Scaffold "Samples Report" output
-# 2) baitfile: SAINT formatted bait file generated in Galaxy
-# 3) prey: Y or N for generating a prey file (requires internet connection)
-#######################################################################################
-infile = sys.argv[1] #Scaffold "Samples Report" output
-prey = sys.argv[2] # Y or N
-fasta_db = sys.argv[3]
-tool_path = sys.argv[8]
-if fasta_db == "None":
- fasta_db = str(tool_path) + "/SwissProt_HUMAN_2014_08.fasta"
-make_bait= sys.argv[5]
-
-
-baits = make_bait.split()
-i = 0
-bait_file_tmp = open("bait.txt", "wr")
-order = []
-bait_cache = []
-
-while i < len(baits):
- if baits[i+2] == "true":
- T_C = "C"
- else:
- T_C = "T"
- line1 = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n"
- q = open(infile,"r")
- for line2 in q:
- line2 = line2.strip()
- temp = line2.split('\t')
- if "Quantitative Variance" in str(temp):
- if baits[i] in temp:
- number_bait = temp.index(str(baits[i]))
- number_bait = number_bait - 9
- bait_cache.append((number_bait, str(line1)))
- else:
- print "Error: bad bait " + str(baits[i])
- sys.exit()
- else:
- pass
- i = i + 3
-
-bait_cache.sort()
-for line in bait_cache:
- bait_file_tmp.write(line[1])
-
-bait_file_tmp.close()
-baitfile = "bait.txt"
-
-class ReturnValue1(object):
- def __init__(self, sequence, gene):
- self.seqlength = sequence
- self.genename = gene
-class ReturnValue2(object):
- def __init__(self, getdata, getproteins, getheader):
- self.data = getdata
- self.proteins = getproteins
- self.header = getheader
-
-def main(scaffold_input, baitfile):
- bait_check(baitfile, scaffold_input)
- make_inter(scaffold_input)
- if prey == 'true':
- make_prey(scaffold_input)
- no_error_inter(scaffold_input)
- os.rename('prey.txt', sys.argv[5])
- elif prey == 'false':
- if os.path.isfile('error proteins.txt') == True:
- no_error_inter(scaffold_input)
- pass
- elif prey != 'true' or 'false':
- sys.exit("Invalid Prey Argument: Y or N")
-
-def get_info(uniprot_accession_in): # get aa lengths and gene name
- error = open('error proteins.txt', 'a+')
-# while True:
-# i = 0
-# try:
-# data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta")
-# break
-# except urllib2.HTTPError, err:
-# i = i + 1
-# if i == 50:
-# sys.exit("More than 50 errors. Check your file or try again later.")
-# if err.code == 404:
-# error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
-# seqlength = 'NA'
-# genename = 'NA'
-# return ReturnValue1(seqlength, genename)
-# elif err.code == 302:
-# sys.exit("Request timed out. Check connection and try again.")
-# else:
-# sys.exit("Uniprot had some other error")
-# lines = data.readlines()
-# if lines == []:
-# error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
-# error.close
-# seqlength = 'NA'
-# genename = 'NA'
-# return ReturnValue1(seqlength, genename)
-# if lines != []:
-# seqlength = 0
-# header = lines[0]
-# for line in lines[1:]:
-# line = line.replace("\n","") # strip \n or else it gets counted in the length
-# seqlength += len(line)
-# if 'GN=' in header:
-# lst = header.split('GN=')
-# lst2 = lst[1].split(' ')
-# genename = lst2[0]
-# error.close
-# return ReturnValue1(seqlength, genename)
-# if 'GN=' not in header:
-# genename = 'NA'
-# error.close
-# return ReturnValue1(seqlength, genename)
- data = open(fasta_db,'r')
- lines = data.readlines()
- db_len = len(lines)
- seqlength = 0
- count = 0
- for i in lines:
- if ">sp" in i:
- if uniprot_accession_in == i.split("|")[1]:
- match = count+1
- if 'GN=' in i:
- lst = i.split('GN=')
- lst2 = lst[1].split(' ')
- genename = lst2[0]
- if 'GN=' not in i:
- genename = 'NA'
- while ">sp" not in lines[match]:
- if match <= db_len:
- seqlength = seqlength + len(lines[match].strip())
- match = match + 1
- else:
- break
- return ReturnValue1(seqlength, genename)
- count = count + 1
-
-
- if seqlength == 0:
- error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n')
- error.close
- seqlength = 'NA'
- genename = 'NA'
- return ReturnValue1(seqlength, genename)
-
-def readtab(infile):
- with open(infile,'r') as x: # read in tab-delim text
- output = []
- for line in x:
- line = line.strip()
- temp = line.split('\t')
- output.append(temp)
- return output
-def read_scaffold(scaffold_input): # Get data, proteins and header from scaffold output
- dupes = readtab(scaffold_input)
- cnt = 0
- for i in dupes:
- cnt += 1
- if i[0] == '#': # finds the start of second header
- header_start = cnt-1
- header = dupes[header_start]
- prot_start = header.index("Accession Number")
- data = dupes[header_start+1:len(dupes)-2] # cut off blank line and END OF FILE
- proteins = []
- for i in data:
- i[4] = i[4].split()[0] # removes the (+##) that sometimes is attached
- for protein in data:
- proteins.append(protein[prot_start])
- return ReturnValue2(data, proteins, header)
-def make_inter(scaffold_input):
- bait = readtab(baitfile)
- data = read_scaffold(scaffold_input).data
- header = read_scaffold(scaffold_input).header
- proteins = read_scaffold(scaffold_input).proteins
- bait_index = []
- for i in bait:
- bait_index.append(header.index(i[0])) # Find just the baits defined in bait file
- with open('inter.txt', 'w') as y:
- a = 0; l=0
- for bb in bait:
- for lst in data:
- y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n')
- a+=1
- if a == len(proteins):
- a = 0; l+=1
-def make_prey(scaffold_input):
- proteins = read_scaffold(scaffold_input).proteins
- output_file = open("prey.txt",'w')
- for a in proteins:
- a = a.replace("\n","") # remove \n for input into function
- a = a.replace("\r","") # ditto for \r
- seq = get_info(a).seqlength
- GN = get_info(a).genename
- if seq != 'NA':
- output_file.write(a+"\t"+str(seq)+ "\t" + str(GN) + "\n")
- output_file.close()
-def no_error_inter(scaffold_input): # remake inter file without protein errors from Uniprot
- err = readtab("error proteins.txt")
- bait = readtab(baitfile)
- data = read_scaffold(scaffold_input).data
- header = read_scaffold(scaffold_input).header
- bait_index = []
- for i in bait:
- bait_index.append(header.index(i[0]))
- proteins = read_scaffold(scaffold_input).proteins
- errors = []
- for e in err:
- errors.append(e[0])
- with open('inter.txt', 'w') as y:
- l = 0; a = 0
- for bb in bait:
- for lst in data:
- if proteins[a] not in errors:
- y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n')
- a+=1
- if a == len(proteins):
- l += 1; a = 0
-def bait_check(bait, scaffold_input): # check that bait names share header titles
- bait_in = readtab(bait)
- header = read_scaffold(scaffold_input).header
- for i in bait_in:
- if i[0] not in header:
- sys.exit("Bait must share header titles with Scaffold output")
-
-if __name__ == '__main__':
- main(infile, baitfile)
-
-os.rename('inter.txt', sys.argv[4])
-os.rename("bait.txt", sys.argv[7])
diff -r 6571324e3d2c -r 378db8ea92ea saint_preproc/SAINT_preprocessing_v6_mq_pep.py
--- a/saint_preproc/SAINT_preprocessing_v6_mq_pep.py Tue Nov 10 13:13:22 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,262 +0,0 @@
-#######################################################################################
-# Python-code: SAINT pre-processing from maxquant "Samples Report" output
-# Author: Brent Kuenzi
-#######################################################################################
-# This program reads in a raw maxquant "Samples Report" output and a user generated
-# bait file and autoformats it into prey and interaction files for SAINTexpress
-# analysis
-#######################################################################################
-import sys
-import urllib2
-import os
-#######################################################################################
-## REQUIRED INPUT ##
-
-# 1) infile: maxquant "Samples Report" output
-# 2) baitfile: SAINT formatted bait file generated in Galaxy
-# 3) prey: Y or N for generating a prey file (requires internet connection)
-#######################################################################################
-mq_file = sys.argv[1]
-cmd = r"Rscript /home/bornea/galaxy_moffitt_dev/tools/Moffitt_Tools/bubblebeam/pre_process_protein_name_set.R " + str(mq_file)
-os.system(cmd)
-
-infile = "./tukeys_output.txt" #maxquant "Samples Report" output
-prey = sys.argv[2] # Y or N
-fasta_db = sys.argv[3]
-if fasta_db == "None":
- fasta_db = "/home/bornea/galaxy_moffitt_dev/tools/Moffitt_Tools/bubblebeam/SwissProt_HUMAN_2014_08.fasta"
-make_bait= sys.argv[6]
-
-def bait_create(baits, infile):
- #Takes the Bait specified by the user and makes them into a Bait file and includes a check to make sure they are using valid baits.
- baits = make_bait.split()
- i = 0
- bait_file_tmp = open("bait.txt", "wr")
- order = []
- bait_cache = []
- while i < len(baits):
- if baits[i+2] == "true":
- T_C = "C"
- else:
- T_C = "T"
- line1 = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n"
- q = open(infile,"r")
- for line2 in q:
- line2 = line2.replace("\"", "")
- line2 = line2.replace(r"Intensity.", "") #R coerces "-" into "." this changes them back and remove Intensity from the Bait names.
- line2 = line2.replace(r".", r"-")
- temp = line2.split()
- if "mapped_protein" in str(temp):
- #If the bait is in the original file then write to cache it if not exit.
- if baits[i] in temp:
- number_bait = temp.index(str(baits[i]))
- number_bait = number_bait - 9
- bait_cache.append((number_bait, str(line1)))
- else:
- print "Error: bad bait " + str(baits[i])
- sys.exit()
- else:
- pass
- i = i + 3
- #Writes cache to file.
- bait_cache.sort()
- for line in bait_cache:
- bait_file_tmp.write(line[1])
-
- bait_file_tmp.close()
-
-
-baitfile = "bait.txt"
-
-class ReturnValue1(object):
- def __init__(self, sequence, gene):
- self.seqlength = sequence
- self.genename = gene
-class ReturnValue2(object):
- def __init__(self, getdata, getproteins, getheader):
- self.data = getdata
- self.proteins = getproteins
- self.header = getheader
-
-def main(maxquant_input, make_bait):
- bait_create(make_bait, infile)
- baitfile = "bait.txt"
- #bait_check(baitfile, maxquant_input)
- make_inter(maxquant_input)
- if prey == 'true':
- make_prey(maxquant_input)
- no_error_inter(maxquant_input)
- os.rename('prey.txt', sys.argv[5])
- elif prey == 'false':
- if os.path.isfile('error proteins.txt') == True:
- no_error_inter(maxquant_input)
- pass
- elif prey != 'true' or 'false':
- sys.exit("Invalid Prey Argument: Y or N")
- os.rename('inter.txt', sys.argv[4])
- os.rename("bait.txt", sys.argv[7])
-
-
-def get_info(uniprot_accession_in): # get aa lengths and gene name
- error = open('error proteins.txt', 'a+')
-# while True:
-# i = 0
-# try:
-# data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta")
-# break
-# except urllib2.HTTPError, err:
-# i = i + 1
-# if i == 50:
-# sys.exit("More than 50 errors. Check your file or try again later.")
-# if err.code == 404:
-# error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
-# seqlength = 'NA'
-# genename = 'NA'
-# return ReturnValue1(seqlength, genename)
-# elif err.code == 302:
-# sys.exit("Request timed out. Check connection and try again.")
-# else:
-# sys.exit("Uniprot had some other error")
-#
-#
-# if lines == []:
-# error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
-# error.close
-# if lines == []:
-# error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
-# error.close
-# seqlength = 'NA'
-# genename = 'NA'
-# return ReturnValue1(seqlength, genename)
-# if lines != []:
-# seqlength = 0
-# header = lines[0]
-# for line in lines[1:]:
-# line = line.replace("\n","") # strip \n or else it gets counted in the length
-# seqlength += len(line)
-# if 'GN=' in header:
-# lst = header.split('GN=')
-# lst2 = lst[1].split(' ')
-# genename = lst2[0]
-# error.close
-# return ReturnValue1(seqlength, genename)
-# if 'GN=' not in header:
-# genename = 'NA'
-# error.close
-# return ReturnValue1(seqlength, genename)
-
- data = open(fasta_db,'r')
- lines = data.readlines()
- db_len = len(lines)
- seqlength = 0
- count = 0
- for i in lines:
- if ">sp" in i:
- if uniprot_accession_in == i.split("|")[1]:
- match = count+1
- if 'GN=' in i:
- lst = i.split('GN=')
- lst2 = lst[1].split(' ')
- genename = lst2[0]
- if 'GN=' not in i:
- genename = 'NA'
- while ">sp" not in lines[match]:
- if match <= db_len:
- seqlength = seqlength + len(lines[match].strip())
- match = match + 1
- else:
- break
- return ReturnValue1(seqlength, genename)
- count = count + 1
-
-
- if seqlength == 0:
- error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n')
- error.close
- seqlength = 'NA'
- genename = 'NA'
- return ReturnValue1(seqlength, genename)
-
-
-def readtab(infile):
- with open(infile,'r') as x: # read in tab-delim text
- output = []
- for line in x:
- line = line.strip()
- temp = line.split('\t')
- output.append(temp)
- return output
-def read_maxquant(maxquant_input): # Get data, proteins and header from maxquant output
- dupes = readtab(maxquant_input)
- header_start = 0
- header = dupes[header_start]
- for i in header:
- i = i.replace(r"\"", "")
- i = i.replace(r"Intensity.", r"")
- i = i.replace(r".", r"-")
- data = dupes[header_start+1:len(dupes)] #cut off blank line and END OF FILE
- proteins = []
- for protein in data:
- proteins.append(protein[0])
- return ReturnValue2(data, proteins, header)
-def make_inter(maxquant_input):
- bait = readtab(baitfile)
- data = read_maxquant(maxquant_input).data
- header = read_maxquant(maxquant_input).header
- proteins = read_maxquant(maxquant_input).proteins
- bait_index = []
- for i in bait:
- bait_index.append(header.index("mapped_protein") + 1) # Find just the baits defined in bait file
- with open('inter.txt', 'w') as y:
- a = 0; l=0
- for bb in bait:
- for lst in data:
- y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n')
- a+=1
- if a == len(proteins):
- a = 0; l+=1
-def make_prey(maxquant_input):
- proteins = read_maxquant(maxquant_input).proteins
- output_file = open("prey.txt",'w')
- for a in proteins:
- a = a.replace("\n","") # remove \n for input into function
- a = a.replace("\r","") # ditto for \r
- seq = get_info(a).seqlength
- GN = get_info(a).genename
- if seq != 'NA':
- output_file.write(a+"\t"+str(seq)+ "\t" + str(GN) + "\n")
- output_file.close()
-def no_error_inter(maxquant_input): # remake inter file without protein errors from Uniprot
- err = readtab("error proteins.txt")
- bait = readtab(baitfile)
- data = read_maxquant(maxquant_input).data
- header = read_maxquant(maxquant_input).header
- header = [i.replace(r"\"", "") for i in header]
- header = [i.replace(r"Intensity.", r"") for i in header]
- header = [i.replace(r".", r"-") for i in header]
- print header
- bait_index = []
- for i in bait:
- bait_index.append(header.index(i[0]))
- proteins = read_maxquant(maxquant_input).proteins
- errors = []
- for e in err:
- errors.append(e[0])
- with open('inter.txt', 'w') as y:
- l = 0; a = 0
- for bb in bait:
- for lst in data:
- if proteins[a] not in errors:
- y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n')
- a+=1
- if a == len(proteins):
- l += 1; a = 0
-def bait_check(bait, maxquant_input): # check that bait names share header titles
- bait_in = readtab(bait)
- header = read_maxquant(maxquant_input).header
- for i in bait_in:
- if i[0] not in header:
- sys.exit("Bait must share header titles with maxquant output")
-
-if __name__ == '__main__':
- main(infile, make_bait)
diff -r 6571324e3d2c -r 378db8ea92ea saint_preproc/pre_process_protein_name_set.R
--- a/saint_preproc/pre_process_protein_name_set.R Tue Nov 10 13:13:22 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,77 +0,0 @@
-library(data.table)
-library(affy)
-library(stringr)
-library(mygene)
-library(VennDiagram)
-#####
-#data
-main <- function(peptides_file) {
- peptides_file = read.delim(peptides_file,header=TRUE,stringsAsFactors=FALSE,fill=TRUE)
- peptides_txt = peptides_file
- intensity_columns = names(peptides_txt[,str_detect(names(peptides_txt),"Intensity\\.*")]) #Pulls out all lines with Intensity in them.
- intensity_columns = intensity_columns[2:length(intensity_columns)] #Removes the first column that does not have a bait.
- peptides_txt_mapped = as.data.frame(map_peptides_proteins(peptides_txt)) #This function as below sets every line to a 1 to 1 intensity to each possible protein.
- peptides_txt_mapped$Uniprot = str_extract(peptides_txt_mapped$mapped_protein, "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") #Pulls out just Uniprot id from the script.
- peptides_txt_mapped = subset(peptides_txt_mapped,!is.na(Uniprot)) #removes reverse sequences and any that didn't match a uniprot accession
- columns_comb = c("Uniprot", intensity_columns)
- peptides_mapped_intensity = subset(peptides_txt_mapped, select = columns_comb) #Subsets out only the needed cloumns for Tukeys (Uniprot IDS and baited intensities)
- swissprot_fasta = scan("/home/philip/galaxy/tools/Moffitt_Tools/uniprot_names.txt",what="character")
- peptides_txt_mapped_log2 = peptides_mapped_intensity
- # Takes the log2 of the intensities.
- for (i in intensity_columns) {
- peptides_txt_mapped_log2[,i] = log2(subset(peptides_txt_mapped_log2, select = i))
- }
- #get the minimum from each column while ignoring the -Inf; get the min of these mins for the global min; breaks when there's only one intensity column
- global_min = min(apply(peptides_txt_mapped_log2[,2:ncol(peptides_txt_mapped_log2)],2,function(x) {
- min(x[x != -Inf])
- }))
- peptides_txt_mapped_log2[peptides_txt_mapped_log2 == -Inf] <- 0
- #uniprot accessions WITHOUT isoforms; it looks like only contaminants contain isoforms anyways
- mapped_protein_uniprotonly = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
- mapped_protein_uniprot_accession = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9](-[0-9]+)?|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(-[0-9]+)?|[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
- peptides_txt_mapped_log2$mapped_protein = mapped_protein_uniprotonly
- # Runs the Tukey function returning completed table
- peptides_txt_mapped_log2 = subset(peptides_txt_mapped_log2,mapped_protein %in% swissprot_fasta)
- protein_intensities_tukeys = get_protein_values(peptides_txt_mapped_log2,intensity_columns)
- protein_intensities_tukeys[protein_intensities_tukeys == 1] <- 0
- write.table(protein_intensities_tukeys, "./tukeys_output.txt", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t")
-
-}
-
-map_peptides_proteins = function(peptides_in) {
- #reverse sequences are blank but have a razor protein indicating that they are reverse; exclude these for now
- peptides_in = subset(peptides_in,peptides_in$Proteins != "")
- results_list = list()
- k = 1
- for (i in 1:nrow(peptides_in)) {
- protein_names = peptides_in[i,"Proteins"]
- protein_names_split = unlist(strsplit(protein_names,";"))
- for (j in 1:length(protein_names_split)) {
- peptides_mapped_proteins = data.frame(peptides_in[i,],mapped_protein=protein_names_split[j],stringsAsFactors=FALSE)
- results_list[[k]] = peptides_mapped_proteins
- k = k+1
-
- }
- }
- return(rbindlist(results_list))
-}
-
-get_protein_values = function(mapped_peptides_in,intensity_columns_list) {
- unique_mapped_proteins_list = unique(mapped_peptides_in$mapped_protein) # Gets list of all peptides listed.
- # Generates a blank data frame with clomns of Intensities and rows of Uniprots.
- Tukeys_df = data.frame(mapped_protein = unique_mapped_proteins_list, stringsAsFactors = FALSE )
- for (q in intensity_columns_list) {Tukeys_df[,q] = NA}
- for (i in 1:length(unique_mapped_proteins_list)) {
- mapped_peptides_unique_subset = subset(mapped_peptides_in, mapped_protein == unique_mapped_proteins_list[i])
- #calculate Tukey's Biweight from library(affy); returns a single numeric
- #results_list[[i]] = data.frame(Protein=unique_mapped_proteins_list[i],Peptides_per_protein=nrow(mapped_peptides_unique_subset))
- for (j in intensity_columns_list) {
- #Populates with new Tukeys values.
- Tukeys_df[i,j] = 2^(tukey.biweight(mapped_peptides_unique_subset[,j]))
- }
- }
- return(Tukeys_df)
-}
-
-args <- commandArgs(trailingOnly = TRUE)
-main(args[1])
diff -r 6571324e3d2c -r 378db8ea92ea saint_preproc/tool_dependencies.xml
--- a/saint_preproc/tool_dependencies.xml Tue Nov 10 13:13:22 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-
-
-
- $REPOSITORY_INSTALL_DIR
- -->
-
\ No newline at end of file