Previous changeset: 3:6571324e3d2c (2015-11-10) | Next changeset: 5:2b631809150b (2015-11-10)

Commit message:
Deleted selected files

Removed:
saint_preproc/SAINT_preprocessing_v5.xml
saint_preproc/SAINT_preprocessing_v6.py
saint_preproc/SAINT_preprocessing_v6_mq_pep.py
saint_preproc/pre_process_protein_name_set.R
saint_preproc/tool_dependencies.xml
diff -r 6571324e3d2c -r 378db8ea92ea saint_preproc/SAINT_preprocessing_v5.xml
--- a/saint_preproc/SAINT_preprocessing_v5.xml	Tue Nov 10 13:13:22 2015 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,91 +0,0 @@
-<tool id="SAINT_preprocessing_v5" name="SAINT pre-processing">
-    <description></description>
-    <command interpreter="python">
-        #if (str($type) == 'Scaffold'):
-            SAINT_preprocessing_v6.py $input $preybool $fasta_db $Inter_file $Prey_file
-            "
-            #for $ba in $bait
-            ${ba.bait1}
-            ${ba.assign}
-            ${ba.T_C}
-            #end for
-            "
-            $Bait_file \$INSTALL_RUN_PATH/
-        #elif (str($type) == 'MaxQuant'):
-            SAINT_preprocessing_v6_mq_pep.py $input $preybool $fasta_db $Inter_file $Prey_file
-            "
-            #for $ba in $bait
-            ${ba.bait1}
-            ${ba.assign}
-            ${ba.T_C}
-            #end for
-            "
-            $Bait_file \$INSTALL_RUN_PATH/
-        #end if
-    </command>
-    <requirements>
-        <requirement type="set_environment">INSTALL_RUN_PATH</requirement>
-    </requirements>
-    <inputs>
-        <param type="select" name="type" label="MaxQuant or Scaffold">
-            <option value="MaxQuant">MaxQuant</option>
-            <option value="Scaffold">Scaffold</option>
-        </param>
-        <param format="dat" name="input" type="data" label="Scaffold or MaxQuant proteinGroup Output"/>
-        <param type="boolean" name="preybool" checked="true" label="Create Prey File"/>
-        <param type="data" name="fasta_db" format="fasta" label="Provide Uniprot Fasta database" />
-        <repeat name="bait" title="Bait Create">
-            <param name="bait1" type="text" size="100"/>
-            <param name="assign" type="text" size="100"/>
-            <param name="T_C" type="boolean" checked="true" label="Is this a Control?"/>
-        </repeat>
-    </inputs>
-    <outputs>
-        <data format="txt" name="Inter_file" label="Inter File"/>
-        <data format="txt" name="Prey_file" label="Prey File" />
-        <data format="txt" name="Bait_file" label="Bait File" />
-    </outputs>
-    <stdio>
-        <regex match="error"
-               source="stdout"
-               level="fatal"
-               description="Unknown error"/>
-        <regex match="Error: bad bait"
-               source="stdout"
-               level="fatal"
-               description="Error: bad bait"/>
-    </stdio>
-
-    <tests>
-        <test>
-            <param name="input" value="fa_gc_content_input.fa"/>
-            <output name="out_file1" file="fa_gc_content_output.txt"/>
-        </test>
-    </tests>
-    <help>
-Pre-processing:
-APOSTL is able to recognize either a Scaffold "Samples Report" file (tab-delimited
-txt file) or the "peptides.txt" file output in the maxquant "txt" output folder. No
-modifications should be made to these files. Using the "Bait Create" tool, you can
-create your "bait.txt" file. It is important that the individual bait names match the
-bait names within your scaffold or maxquant output. APOSTL uses the bait file to find
-the user's baits of interest. Additionally there is an option to make the prey file (Y/N).
-When making a prey file, APOSTL queries uniprot in order to extract protein amino acid
-lengths and gene names. This takes several minutes depending on your internet connection.
-Some users may want to run SAINTexpress using the same data set while changing which baits
-are considered test or control. It is useful to toggle "Make Prey" off in order to save
-time by circumventing this step as the same prey file can be used for both SAINTexpress
-runs.
-
-INPUTS:
-
-Scaffold file:
-
-- Scaffold "Samples Report" output (tab-delimited txt file)
-
-
-Maxquant file:
-
-- maxquant "peptides.txt" file (tab-delimited txt file)
-    </help>
-</tool>
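The help text above describes how the "Bait Create" repeat is turned into a SAINT bait file: each bait contributes one tab-separated line with the bait column name, the bait name, and a T/C flag (test or control). As a rough illustration of that format only, not the wrapper's actual code, and with invented column and bait names, a minimal Python sketch:

# Minimal sketch of the bait.txt layout the downstream scripts expect.
# Column names ("FLAG_EGFP", "FLAG_RAF1") and bait names are invented examples.
baits = [
    ("FLAG_EGFP", "EGFP", True),   # control purification
    ("FLAG_RAF1", "RAF1", False),  # test purification
]
with open("bait.txt", "w") as out:
    for column_name, bait_name, is_control in baits:
        flag = "C" if is_control else "T"  # SAINTexpress uses T (test) / C (control)
        out.write(column_name + "\t" + bait_name + "\t" + flag + "\n")

Each line must use a column name that actually appears in the Scaffold or MaxQuant output header, which is what the bait-validation step in the scripts below enforces.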
diff -r 6571324e3d2c -r 378db8ea92ea saint_preproc/SAINT_preprocessing_v6.py
--- a/saint_preproc/SAINT_preprocessing_v6.py	Tue Nov 10 13:13:22 2015 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,245 +0,0 @@
-#######################################################################################
-# Python-code: SAINT pre-processing from Scaffold "Samples Report" output
-# Author: Brent Kuenzi
-#######################################################################################
-# This program reads in a raw Scaffold "Samples Report" output and a user generated
-# bait file and autoformats it into prey and interaction files for SAINTexpress
-# analysis
-#######################################################################################
-import sys
-import urllib2
-import os.path
-#######################################################################################
-## REQUIRED INPUT ##
-
-# 1) infile: Scaffold "Samples Report" output
-# 2) baitfile: SAINT formatted bait file generated in Galaxy
-# 3) prey: Y or N for generating a prey file (requires internet connection)
-#######################################################################################
-infile = sys.argv[1] #Scaffold "Samples Report" output
-prey = sys.argv[2] # Y or N
-fasta_db = sys.argv[3]
-tool_path = sys.argv[8]
-if fasta_db == "None":
-    fasta_db = str(tool_path) + "/SwissProt_HUMAN_2014_08.fasta"
-make_bait = sys.argv[5]
-
-
-baits = make_bait.split()
-i = 0
-bait_file_tmp = open("bait.txt", "wr")
-order = []
-bait_cache = []
-
-while i < len(baits):
-    if baits[i+2] == "true":
-        T_C = "C"
-    else:
-        T_C = "T"
-    line1 = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n"
-    q = open(infile, "r")
-    for line2 in q:
-        line2 = line2.strip()
-        temp = line2.split('\t')
-        if "Quantitative Variance" in str(temp):
-            if baits[i] in temp:
-                number_bait = temp.index(str(baits[i]))
-                number_bait = number_bait - 9
-                bait_cache.append((number_bait, str(line1)))
-            else:
-                print "Error: bad bait " + str(baits[i])
-                sys.exit()
-        else:
-            pass
-    i = i + 3
-
-bait_cache.sort()
-for line in bait_cache:
-    bait_file_tmp.write(line[1])
-
-bait_file_tmp.close()
-baitfile = "bait.txt"
-
-class ReturnValue1(object):
-    def __init__(self, sequence, gene):
-        self.seqlength = sequence
-        self.genename = gene
-class ReturnValue2(object):
-    def __init__(self, getdata, getproteins, getheader):
-        self.data = getdata
-        self.proteins = getproteins
-        self.header = getheader
-
-def main(scaffold_input, baitfile):
-    bait_check(baitfile, scaffold_input)
-    make_inter(scaffold_input)
-    if prey == 'true':
-        make_prey(scaffold_input)
-        no_error_inter(scaffold_input)
-        os.rename('prey.txt', sys.argv[5])
-    elif prey == 'false':
-        if os.path.isfile('error proteins.txt') == True:
-            no_error_inter(scaffold_input)
-        pass
-    elif prey != 'true' or 'false':
-        sys.exit("Invalid Prey Argument: Y or N")
-
-def get_info(uniprot_accession_in): # get aa lengths and gene name
-    error = open('error proteins.txt', 'a+')
-#    while True:
-#        i = 0
-#        try:
-#            data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta")
-#            break
-#        except urllib2.HTTPError, err:
-#            i = i + 1
-#            if i == 50:
-#                sys.exit("More than 50 errors. Check your file or try again later.")
-#            if err.code == 404:
-#                error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
-#                seqlength = 'NA'
-#                genename = 'NA'
-#                return ReturnValue1(seqlength, genename)
-#            elif err.code == 302:
-#                sys.exit("Request timed out. Check connection and try again.")
-#            else:
-#                sys.exit
[...]
-            if match <= db_len:
-                seqlength = seqlength + len(lines[match].strip())
-                match = match + 1
-            else:
-                break
-            return ReturnValue1(seqlength, genename)
-        count = count + 1
-
-
-    if seqlength == 0:
-        error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n')
-        error.close
-        seqlength = 'NA'
-        genename = 'NA'
-        return ReturnValue1(seqlength, genename)
-
-def readtab(infile):
-    with open(infile, 'r') as x: # read in tab-delim text
-        output = []
-        for line in x:
-            line = line.strip()
-            temp = line.split('\t')
-            output.append(temp)
-    return output
-def read_scaffold(scaffold_input): # Get data, proteins and header from scaffold output
-    dupes = readtab(scaffold_input)
-    cnt = 0
-    for i in dupes:
-        cnt += 1
-        if i[0] == '#': # finds the start of second header
-            header_start = cnt-1
-    header = dupes[header_start]
-    prot_start = header.index("Accession Number")
-    data = dupes[header_start+1:len(dupes)-2] # cut off blank line and END OF FILE
-    proteins = []
-    for i in data:
-        i[4] = i[4].split()[0] # removes the (+##) that sometimes is attached
-    for protein in data:
-        proteins.append(protein[prot_start])
-    return ReturnValue2(data, proteins, header)
-def make_inter(scaffold_input):
-    bait = readtab(baitfile)
-    data = read_scaffold(scaffold_input).data
-    header = read_scaffold(scaffold_input).header
-    proteins = read_scaffold(scaffold_input).proteins
-    bait_index = []
-    for i in bait:
-        bait_index.append(header.index(i[0])) # Find just the baits defined in bait file
-    with open('inter.txt', 'w') as y:
-        a = 0; l = 0
-        for bb in bait:
-            for lst in data:
-                y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n')
-                a += 1
-                if a == len(proteins):
-                    a = 0; l += 1
-def make_prey(scaffold_input):
-    proteins = read_scaffold(scaffold_input).proteins
-    output_file = open("prey.txt", 'w')
-    for a in proteins:
-        a = a.replace("\n", "") # remove \n for input into function
-        a = a.replace("\r", "") # ditto for \r
-        seq = get_info(a).seqlength
-        GN = get_info(a).genename
-        if seq != 'NA':
-            output_file.write(a + "\t" + str(seq) + "\t" + str(GN) + "\n")
-    output_file.close()
-def no_error_inter(scaffold_input): # remake inter file without protein errors from Uniprot
-    err = readtab("error proteins.txt")
-    bait = readtab(baitfile)
-    data = read_scaffold(scaffold_input).data
-    header = read_scaffold(scaffold_input).header
-    bait_index = []
-    for i in bait:
-        bait_index.append(header.index(i[0]))
-    proteins = read_scaffold(scaffold_input).proteins
-    errors = []
-    for e in err:
-        errors.append(e[0])
-    with open('inter.txt', 'w') as y:
-        l = 0; a = 0
-        for bb in bait:
-            for lst in data:
-                if proteins[a] not in errors:
-                    y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n')
-                a += 1
-                if a == len(proteins):
-                    l += 1; a = 0
-def bait_check(bait, scaffold_input): # check that bait names share header titles
-    bait_in = readtab(bait)
-    header = read_scaffold(scaffold_input).header
-    for i in bait_in:
-        if i[0] not in header:
-            sys.exit("Bait must share header titles with Scaffold output")
-
-if __name__ == '__main__':
-    main(infile, baitfile)
-
-os.rename('inter.txt', sys.argv[4])
-os.rename("bait.txt", sys.argv[7])
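The get_info routine above derives prey amino-acid lengths and gene names from a local UniProt FASTA (the urllib2-based web lookup is commented out). A simplified, self-contained sketch of that idea, assuming standard UniProt FASTA headers such as ">sp|P04637|P53_HUMAN ... GN=TP53 ..." and not the script's exact parsing:

import re

def fasta_prey_info(fasta_path, accession):
    """Return (sequence_length, gene_name) for a UniProt accession, or ('NA', 'NA')."""
    seq_len, gene_name, in_record = 0, 'NA', False
    with open(fasta_path) as fasta:
        for line in fasta:
            if line.startswith('>'):
                if in_record:            # reached the next record; stop
                    break
                if accession in line:    # header of the record we want
                    in_record = True
                    gn = re.search(r'GN=(\S+)', line)
                    if gn:
                        gene_name = gn.group(1)
            elif in_record:
                seq_len += len(line.strip())
    return (seq_len, gene_name) if seq_len else ('NA', 'NA')

Accessions that are not found fall through to ('NA', 'NA'), which mirrors how the script routes missing preys into "error proteins.txt" and later rebuilds inter.txt without them.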
diff -r 6571324e3d2c -r 378db8ea92ea saint_preproc/SAINT_preprocessing_v6_mq_pep.py
--- a/saint_preproc/SAINT_preprocessing_v6_mq_pep.py	Tue Nov 10 13:13:22 2015 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,262 +0,0 @@
-#######################################################################################
-# Python-code: SAINT pre-processing from maxquant "Samples Report" output
-# Author: Brent Kuenzi
-#######################################################################################
-# This program reads in a raw maxquant "Samples Report" output and a user generated
-# bait file and autoformats it into prey and interaction files for SAINTexpress
-# analysis
-#######################################################################################
-import sys
-import urllib2
-import os
-#######################################################################################
-## REQUIRED INPUT ##
-
-# 1) infile: maxquant "Samples Report" output
-# 2) baitfile: SAINT formatted bait file generated in Galaxy
-# 3) prey: Y or N for generating a prey file (requires internet connection)
-#######################################################################################
-mq_file = sys.argv[1]
-cmd = r"Rscript /home/bornea/galaxy_moffitt_dev/tools/Moffitt_Tools/bubblebeam/pre_process_protein_name_set.R " + str(mq_file)
-os.system(cmd)
-
-infile = "./tukeys_output.txt" #maxquant "Samples Report" output
-prey = sys.argv[2] # Y or N
-fasta_db = sys.argv[3]
-if fasta_db == "None":
-    fasta_db = "/home/bornea/galaxy_moffitt_dev/tools/Moffitt_Tools/bubblebeam/SwissProt_HUMAN_2014_08.fasta"
-make_bait = sys.argv[6]
-
-def bait_create(baits, infile):
-    #Takes the Bait specified by the user and makes them into a Bait file and includes a check to make sure they are using valid baits.
-    baits = make_bait.split()
-    i = 0
-    bait_file_tmp = open("bait.txt", "wr")
-    order = []
-    bait_cache = []
-    while i < len(baits):
-        if baits[i+2] == "true":
-            T_C = "C"
-        else:
-            T_C = "T"
-        line1 = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n"
-        q = open(infile, "r")
-        for line2 in q:
-            line2 = line2.replace("\"", "")
-            line2 = line2.replace(r"Intensity.", "") #R coerces "-" into "." this changes them back and remove Intensity from the Bait names.
-            line2 = line2.replace(r".", r"-")
-            temp = line2.split()
-            if "mapped_protein" in str(temp):
-                #If the bait is in the original file then write to cache it if not exit.
-                if baits[i] in temp:
-                    number_bait = temp.index(str(baits[i]))
-                    number_bait = number_bait - 9
-                    bait_cache.append((number_bait, str(line1)))
-                else:
-                    print "Error: bad bait " + str(baits[i])
-                    sys.exit()
-            else:
-                pass
-        i = i + 3
-    #Writes cache to file.
-    bait_cache.sort()
-    for line in bait_cache:
-        bait_file_tmp.write(line[1])
-
-    bait_file_tmp.close()
-
-
-baitfile = "bait.txt"
-
-class ReturnValue1(object):
-    def __init__(self, sequence, gene):
-        self.seqlength = sequence
-        self.genename = gene
-class ReturnValue2(object):
-    def __init__(self, getdata, getproteins, getheader):
-        self.data = getdata
-        self.proteins = getproteins
-        self.header = getheader
-
-def main(maxquant_input, make_bait):
-    bait_create(make_bait, infile)
-    baitfile = "bait.txt"
-    #bait_check(baitfile, maxquant_input)
-    make_inter(maxquant_input)
-    if prey == 'true':
-        make_prey(maxquant_input)
-        no_error_inter(maxquant_input)
-        os.rename('prey.txt', sys.argv[5])
-    elif prey == 'false':
-        if os.path.isfile('error proteins.txt') == True:
-            no_error_inter(maxquant_input)
-        pass
-    elif prey != 'true' or 'false':
-        sys.exit("Invalid Prey Argument: Y or N")
-    os.rename('inter.txt', sys.argv[4])
-
[...]
-            if match <= db_len:
-                seqlength = seqlength + len(lines[match].strip())
-                match = match + 1
-            else:
-                break
-            return ReturnValue1(seqlength, genename)
-        count = count + 1
-
-
-    if seqlength == 0:
-        error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n')
-        error.close
-        seqlength = 'NA'
-        genename = 'NA'
-        return ReturnValue1(seqlength, genename)
-
-
-def readtab(infile):
-    with open(infile, 'r') as x: # read in tab-delim text
-        output = []
-        for line in x:
-            line = line.strip()
-            temp = line.split('\t')
-            output.append(temp)
-    return output
-def read_maxquant(maxquant_input): # Get data, proteins and header from maxquant output
-    dupes = readtab(maxquant_input)
-    header_start = 0
-    header = dupes[header_start]
-    for i in header:
-        i = i.replace(r"\"", "")
-        i = i.replace(r"Intensity.", r"")
-        i = i.replace(r".", r"-")
-    data = dupes[header_start+1:len(dupes)] #cut off blank line and END OF FILE
-    proteins = []
-    for protein in data:
-        proteins.append(protein[0])
-    return ReturnValue2(data, proteins, header)
-def make_inter(maxquant_input):
-    bait = readtab(baitfile)
-    data = read_maxquant(maxquant_input).data
-    header = read_maxquant(maxquant_input).header
-    proteins = read_maxquant(maxquant_input).proteins
-    bait_index = []
-    for i in bait:
-        bait_index.append(header.index("mapped_protein") + 1) # Find just the baits defined in bait file
-    with open('inter.txt', 'w') as y:
-        a = 0; l = 0
-        for bb in bait:
-            for lst in data:
-                y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n')
-                a += 1
-                if a == len(proteins):
-                    a = 0; l += 1
-def make_prey(maxquant_input):
-    proteins = read_maxquant(maxquant_input).proteins
-    output_file = open("prey.txt", 'w')
-    for a in proteins:
-        a = a.replace("\n", "") # remove \n for input into function
-        a = a.replace("\r", "") # ditto for \r
-        seq = get_info(a).seqlength
-        GN = get_info(a).genename
-        if seq != 'NA':
-            output_file.write(a + "\t" + str(seq) + "\t" + str(GN) + "\n")
-    output_file.close()
-def no_error_inter(maxquant_input): # remake inter file without protein errors from Uniprot
-    err = readtab("error proteins.txt")
-    bait = readtab(baitfile)
-    data = read_maxquant(maxquant_input).data
-    header = read_maxquant(maxquant_input).header
-    header = [i.replace(r"\"", "") for i in header]
-    header = [i.replace(r"Intensity.", r"") for i in header]
-    header = [i.replace(r".", r"-") for i in header]
-    print header
-    bait_index = []
-    for i in bait:
-        bait_index.append(header.index(i[0]))
-    proteins = read_maxquant(maxquant_input).proteins
-    errors = []
-    for e in err:
-        errors.append(e[0])
-    with open('inter.txt', 'w') as y:
-        l = 0; a = 0
-        for bb in bait:
-            for lst in data:
-                if proteins[a] not in errors:
-                    y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n')
-                a += 1
-                if a == len(proteins):
-                    l += 1; a = 0
-def bait_check(bait, maxquant_input): # check that bait names share header titles
-    bait_in = readtab(bait)
-    header = read_maxquant(maxquant_input).header
-    for i in bait_in:
-        if i[0] not in header:
-            sys.exit("Bait must share header titles with maxquant output")
-
-if __name__ == '__main__':
-    main(infile, make_bait)
diff -r 6571324e3d2c -r 378db8ea92ea saint_preproc/pre_process_protein_name_set.R
--- a/saint_preproc/pre_process_protein_name_set.R	Tue Nov 10 13:13:22 2015 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,77 +0,0 @@
-library(data.table)
-library(affy)
-library(stringr)
-library(mygene)
-library(VennDiagram)
-#####
-#data
-main <- function(peptides_file) {
-  peptides_file = read.delim(peptides_file,header=TRUE,stringsAsFactors=FALSE,fill=TRUE)
-  peptides_txt = peptides_file
-  intensity_columns = names(peptides_txt[,str_detect(names(peptides_txt),"Intensity\\.*")]) #Pulls out all lines with Intensity in them.
-  intensity_columns = intensity_columns[2:length(intensity_columns)] #Removes the first column that does not have a bait.
-  peptides_txt_mapped = as.data.frame(map_peptides_proteins(peptides_txt)) #This function as below sets every line to a 1 to 1 intensity to each possible protein.
-  peptides_txt_mapped$Uniprot = str_extract(peptides_txt_mapped$mapped_protein, "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") #Pulls out just Uniprot id from the script.
-  peptides_txt_mapped = subset(peptides_txt_mapped,!is.na(Uniprot)) #removes reverse sequences and any that didn't match a uniprot accession
-  columns_comb = c("Uniprot", intensity_columns)
-  peptides_mapped_intensity = subset(peptides_txt_mapped, select = columns_comb) #Subsets out only the needed cloumns for Tukeys (Uniprot IDS and baited intensities)
-  swissprot_fasta = scan("/home/philip/galaxy/tools/Moffitt_Tools/uniprot_names.txt",what="character")
-  peptides_txt_mapped_log2 = peptides_mapped_intensity
-  # Takes the log2 of the intensities.
-  for (i in intensity_columns) {
-    peptides_txt_mapped_log2[,i] = log2(subset(peptides_txt_mapped_log2, select = i))
-  }
-  #get the minimum from each column while ignoring the -Inf; get the min of these mins for the global min; breaks when there's only one intensity column
-  global_min = min(apply(peptides_txt_mapped_log2[,2:ncol(peptides_txt_mapped_log2)],2,function(x) {
-    min(x[x != -Inf])
-  }))
-  peptides_txt_mapped_log2[peptides_txt_mapped_log2 == -Inf] <- 0
-  #uniprot accessions WITHOUT isoforms; it looks like only contaminants contain isoforms anyways
-  mapped_protein_uniprotonly = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
-  mapped_protein_uniprot_accession = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9](-[0-9]+)?|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(-[0-9]+)?|[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
-  peptides_txt_mapped_log2$mapped_protein = mapped_protein_uniprotonly
-  # Runs the Tukey function returning completed table
-  peptides_txt_mapped_log2 = subset(peptides_txt_mapped_log2,mapped_protein %in% swissprot_fasta)
-  protein_intensities_tukeys = get_protein_values(peptides_txt_mapped_log2,intensity_columns)
-  protein_intensities_tukeys[protein_intensities_tukeys == 1] <- 0
-  write.table(protein_intensities_tukeys, "./tukeys_output.txt", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t")
-
-}
-
-map_peptides_proteins = function(peptides_in) {
-  #reverse sequences are blank but have a razor protein indicating that they are reverse; exclude these for now
-  peptides_in = subset(peptides_in,peptides_in$Proteins != "")
-  results_list = list()
-  k = 1
-  for (i in 1:nrow(peptides_in)) {
-    protein_names = peptides_in[i,"Proteins"]
-    protein_names_split = unlist(strsplit(protein_names,";"))
-    for (j in 1:length(protein_names_split)) {
-      peptides_mapped_proteins = data.frame(peptides_in[i,],mapped_protein=protein_names_split[j],stringsAsFactors=FALSE)
-      results_list[[k]] = peptides_mapped_proteins
-      k = k+1
-
-    }
-  }
-  return(rbindlist(results_list))
-}
-
-get_protein_values = function(mapped_peptides_in,intensity_columns_list) {
-  unique_mapped_proteins_list = unique(mapped_peptides_in$mapped_protein) # Gets list of all peptides listed.
-  # Generates a blank data frame with clomns of Intensities and rows of Uniprots.
-  Tukeys_df = data.frame(mapped_protein = unique_mapped_proteins_list, stringsAsFactors = FALSE )
-  for (q in intensity_columns_list) {Tukeys_df[,q] = NA}
-  for (i in 1:length(unique_mapped_proteins_list)) {
-    mapped_peptides_unique_subset = subset(mapped_peptides_in, mapped_protein == unique_mapped_proteins_list[i])
-    #calculate Tukey's Biweight from library(affy); returns a single numeric
-    #results_list[[i]] = data.frame(Protein=unique_mapped_proteins_list[i],Peptides_per_protein=nrow(mapped_peptides_unique_subset))
-    for (j in intensity_columns_list) {
-      #Populates with new Tukeys values.
-      Tukeys_df[i,j] = 2^(tukey.biweight(mapped_peptides_unique_subset[,j]))
-    }
-  }
-  return(Tukeys_df)
-}
-
-args <- commandArgs(trailingOnly = TRUE)
-main(args[1])
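The R script above collapses the log2 peptide intensities to one value per protein with affy's tukey.biweight inside get_protein_values, then returns to the linear scale with 2^(...). For intuition only, here is a generic one-step Tukey biweight (median/MAD-weighted mean) in Python, which is in the spirit of that call but not the affy implementation; the intensity values are invented:

import statistics

def tukey_biweight(values, c=5.0, epsilon=1e-4):
    """One-step Tukey biweight: a robust mean that down-weights outliers."""
    med = statistics.median(values)
    mad = statistics.median([abs(v - med) for v in values])
    scale = c * mad + epsilon
    weights = []
    for v in values:
        u = (v - med) / scale
        weights.append((1 - u * u) ** 2 if abs(u) <= 1 else 0.0)
    total = sum(weights)
    return sum(w * v for w, v in zip(weights, values)) / total if total else med

# Summarize one protein's log2 peptide intensities, then back to linear scale,
# mirroring 2^(tukey.biweight(...)) above. Values are made up; 25.0 is an outlier.
log2_peptides = [20.1, 20.4, 19.8, 25.0]
protein_intensity = 2 ** tukey_biweight(log2_peptides)

The biweight keeps the per-protein summary near the bulk of its peptides instead of letting a single outlying peptide intensity dominate a plain mean.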
diff -r 6571324e3d2c -r 378db8ea92ea saint_preproc/tool_dependencies.xml
--- a/saint_preproc/tool_dependencies.xml	Tue Nov 10 13:13:22 2015 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <set_environment version="1.0">
-        <environment_variable name="INSTALL_RUN_PATH" action="set_to">$REPOSITORY_INSTALL_DIR</environment_variable>
-    </set_environment>
-</tool_dependency>
\ No newline at end of file