Mercurial > repos > proteore > filter_keywords_values
changeset 0:6a45ccfc0e4c draft
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
author | proteore |
---|---|
date | Sun, 26 Nov 2017 18:36:43 -0500 |
parents | |
children | d29e469b6b20 |
files | README.rst filter_kw_val.py filter_kw_val.xml test-data/UnipIDs.txt test-data/filter_keywords_values_output.txt test-data/filter_keywords_values_removed.txt |
diffstat | 6 files changed, 494 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.rst Sun Nov 26 18:36:43 2017 -0500 @@ -0,0 +1,62 @@ +Wrapper for Filter out keywords and/or numerical values tool +============================================================ + +**Authors** + +T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR +Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform + +This work has been partially funded through the French National Agency for Research (ANR) IFB project. + +Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool. + +------------------------------------------------------------ + +This tool removes unneeded data (e.g. contaminants, non-significant values) from a proteomics results file (e.g. MaxQuant or Proline output). + +**For each row, if the filtered column contains more than one protein ID/protein name/gene name, only the first one is kept in the output** + +**Filter the file by keywords** + +Several options can be used. For each option, you can fill in the field or upload a file which contains the keywords. + +- If you choose to fill in the field, the keywords should be separated by ":", for example: A8K2U0:Q5TA79:O43175 + +- If you choose to upload a file, it must be a text file in which each line is a keyword, for example: + + REV + + TRYP_PIG + + ALDOA_RABBIT + +**Every line that contains one of these keywords will be removed from the input file.** + +**The keyword search performs either an exact or a partial match, depending on the following option** + +- If you choose **Yes**, only the lines in which the field is exactly equal to a keyword will be removed. + +- If you choose **No**, all the lines in which the field contains a keyword will be removed. 
+ +For example: + +**Yes** option (exact match) selected using the keyword "kinase": only lines in which the field is exactly "kinase" are removed. + +**No** option (partial match) for "kinase": not only lines which contain "kinase" but also lines with "alpha-kinase" (and so on) are removed. + +**Filter the file by values** + +You can choose to use one or more options (e.g. to filter out peptides of low intensity value, by q-value, etc.). + +* For each option, you can choose between "=", ">", ">=", "<" and "<=", then enter the threshold value and specify the column to which the option applies. Rows whose value satisfies the condition are kept; the other rows are removed. + +**Output** + +The tool will produce 2 output files. + +* A text file containing the resulting filtered input file. + +* A text file containing the rows removed from the input file. + + +
# filter_kw_val.py -- added as a new file in changeset 0:6a45ccfc0e4c
# (diff header: /dev/null -> b/filter_kw_val.py, Sun Nov 26 18:36:43 2017 -0500,
#  @@ -0,0 +1,180 @@)
"""Filter the rows of a tab-delimited file by keywords and/or numerical values.

Rows that are filtered out are written to a separate "trash" file, the
remaining rows to the output file.

Example::

    python filter_kw_val.py -i "proteinGroups.txt,true" \
        --kw "A2A288:A8K2U0" c1 false --value 2 c5 ">=" \
        -o output.txt --trash_file removed.txt
"""
import argparse
import operator
import re

# Optional minus sign, then digits with an optional fraction (or a bare
# fraction), then an optional exponent: "5", "0.05", "-1.5", ".5", "1e-05".
_FLOAT_RE = re.compile(r"^-?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?$")
_INT_RE = re.compile(r"^-?[0-9]+$")

# Comparison selected by the ``opt`` argument of filter_value(); any other
# operator string falls back to equality, as in the original if/elif chain.
_COMPARATORS = {
    "<": operator.lt,
    "<=": operator.le,
    ">": operator.gt,
    ">=": operator.ge,
}

_COLUMN_ERROR = ("Please specify the column where you would like to apply "
                 "the filter with valid format")


def options(argv=None):
    """Parse the command line and run the filters.

    ``argv`` is the list of arguments to parse; ``None`` (the default) means
    ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", required=True,
                        help='Input file and header flag, as "<file>,<true|false>"')
    parser.add_argument("-m", "--match", help="Exact match")
    parser.add_argument("--kw", nargs="+", action="append",
                        help='Keyword filter: "<kw1:kw2:...>" <column> <exact match>')
    parser.add_argument("--kw_file", nargs="+", action="append",
                        help="Keyword filter: <keyword file> <column> <exact match>")
    parser.add_argument("--value", nargs="+", action="append",
                        help="Value filter: <number> <column> <operator>")
    parser.add_argument("-o", "--output", default="output.txt")
    parser.add_argument("--trash_file", default="trash_MQfilter.txt")

    args = parser.parse_args(argv)

    filters(args)


def isnumber(format, n):
    """Return True if the string ``n`` is a number of the given ``format``.

    ``format`` is "int" or "float"; any other value yields False.  "float"
    also accepts integers, values below 1 such as "0.05" and scientific
    notation.
    """
    if format == "int":
        pattern = _INT_RE
    elif format == "float":
        pattern = _FLOAT_RE
    else:
        return False
    return bool(pattern.match(n))


def _column_index(ncol):
    """Convert a column name such as "c3" into a 0-based index.

    Raises ValueError for anything that is not a column number >= 1, so that
    "c0" or a negative number cannot silently index from the end of the row.
    """
    digits = ncol.replace("c", "") if ncol else ""
    if not isnumber("int", digits) or int(digits) < 1:
        raise ValueError(_COLUMN_ERROR)
    return int(digits) - 1


def _split_header(rows, header):
    """Return (header rows, content rows); the header is the first row when
    ``header`` is the string "true" and the file is not empty."""
    header_rows = rows[:1] if header == "true" else []
    return header_rows, rows[len(header_rows):]


def _split_fields(line):
    """Split a row into its tab-separated fields and its line terminator."""
    body = line.rstrip("\r\n")
    return body.split("\t"), line[len(body):]


def filters(args):
    """Apply every requested filter and write the two result files.

    ``args.input`` is "<file>,<true|false>"; the part after the last comma
    tells whether the first row is a header.  Keyword filters (``--kw``, then
    ``--kw_file``) run before value filters (``--value``).

    Raises ValueError when the input argument has no header flag or when a
    value filter is not given a number.
    """
    # Split on the LAST comma so that file names containing commas still work.
    parts = args.input.rsplit(",", 1)
    if len(parts) != 2:
        raise ValueError('Input must be given as "<file>,<true|false>"')
    MQfilename, header = parts

    results = [readMQ(MQfilename), None]

    for k in args.kw or []:
        results = filter_keyword(results[0], header, results[1], k[0], k[1], k[2])
    for kf in args.kw_file or []:
        ids = readOption(kf[0])
        results = filter_keyword(results[0], header, results[1], ids, kf[1], kf[2])
    for v in args.value or []:
        if not isnumber("float", v[0]):
            raise ValueError("Please enter a number in filter by value")
        results = filter_value(results[0], header, results[1], v[0], v[1], v[2])

    # Write results to output
    with open(args.output, "w") as output:
        output.write("".join(results[0]))

    # Write deleted lines to trash_file (empty when no filter was requested)
    with open(args.trash_file, "w") as trash:
        trash.write("".join(results[1] or []))


def readOption(filename):
    """Read a keyword file (one keyword per line) into a ":"-separated string.

    Blank lines are skipped and line terminators (including "\\r\\n") are
    removed, so that no empty or "\\r"-suffixed keyword reaches the filter.
    """
    with open(filename, "r") as f:
        keywords = [line.strip() for line in f.read().splitlines()]
    return ":".join(kw for kw in keywords if kw)


def readMQ(MQfilename):
    """Return the lines of the input file, without the blank ones."""
    with open(MQfilename, "r") as mqfile:
        return [line for line in mqfile if line.strip()]


def filter_keyword(MQfile, header, filtered_lines, ids, ncol, match):
    """Remove the rows whose column ``ncol`` holds one of the keywords.

    MQfile         -- list of rows (updated in place and returned)
    header         -- "true" if the first row is a header
    filtered_lines -- rows removed by previous filters, or None
    ids            -- keywords separated by ":" (compared case-insensitively)
    ncol           -- column to inspect, e.g. "c1"
    match          -- "false" for a partial (substring) match, anything else
                      for an exact match

    The inspected field may list several IDs separated by ";"; each of them
    is tested, and the field is reduced to its first ID (quotes removed) in
    both the kept and the removed rows.

    Returns (kept rows, removed rows); raises ValueError on an invalid column.
    """
    id_index = _column_index(ncol)

    # Drop blank keywords: an empty keyword would match every row in
    # partial-match mode.
    keywords = [kw for kw in ids.upper().split(":") if kw.strip()]
    keyword_set = set(keywords)
    exact = match != "false"

    header_rows, content = _split_header(MQfile, header)

    if not filtered_lines:  # Otherwise keep the lines removed by other filters
        filtered_lines = list(header_rows)

    kept = list(header_rows)
    for line in content:
        # Strip the line terminator first so that the last column can be
        # matched and rewritten without losing the newline.
        fields, eol = _split_fields(line)
        id_inline = fields[id_index].replace('"', "").split(";")
        fields[id_index] = id_inline[0]  # Take only first ID
        one_id_line = "\t".join(fields) + eol

        candidates = [pid.upper() for pid in id_inline]
        if exact:
            hit = any(pid in keyword_set for pid in candidates)
        else:
            hit = any(kw in pid for pid in candidates for kw in keywords)

        if hit:
            filtered_lines.append(one_id_line)
        else:
            kept.append(one_id_line)

    MQfile[:] = kept
    return MQfile, filtered_lines


def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt):
    """Keep the rows whose column ``ncol`` satisfies ``value <opt> filter_value``.

    MQfile         -- list of rows (updated in place and returned)
    header         -- "true" if the first row is a header
    filtered_prots -- rows removed by previous filters, or None
    filter_value   -- threshold (number or numeric string)
    ncol           -- column to inspect, e.g. "c2"
    opt            -- "<", "<=", ">", ">="; anything else means equality

    Rows whose field is not a plain number (empty, "NaN", text, ...) are kept.

    Returns (kept rows, removed rows); raises ValueError on an invalid column
    or threshold.
    """
    index = _column_index(ncol)
    threshold = float(filter_value)
    satisfies = _COMPARATORS.get(opt, operator.eq)

    header_rows, content = _split_header(MQfile, header)

    if not filtered_prots:  # Otherwise keep the lines removed by other filters
        filtered_prots = list(header_rows)

    kept = list(header_rows)
    for prot in content:
        fields, _ = _split_fields(prot)
        pep = fields[index].replace('"', "").strip()
        if isnumber("float", pep) and not satisfies(float(pep), threshold):
            filtered_prots.append(prot)
        else:
            kept.append(prot)

    MQfile[:] = kept
    return MQfile, filtered_prots


if __name__ == "__main__":
    options()
<!-- filter_kw_val.xml: added as a new file in changeset 0:6a45ccfc0e4c (Sun Nov 26 18:36:43 2017 -0500) -->
<!-- Galaxy wrapper for filter_kw_val.py: builds the command line from the repeated keyword and value filters. -->
<tool id="MQoutputfilter" name="Filter out keywords and/or numerical values" version="0.1.0">
    <description>Filter a file by keywords or values</description>
    <requirements>
    </requirements>
    <stdio>
        <exit_code range="1:" />
    </stdio>
    <command><![CDATA[
        python $__tool_directory__/filter_kw_val.py
        ## The header flag travels with the file name; the script splits them on the comma.
        -i "$input1,$header"
        -o "$output1"
        --trash_file "$trash_file"

        ## Keywords: each filter is passed as <keywords or keyword file> <column> <exact match>
        #for $i, $key in enumerate($keyword)
            #if $key.k.kw != "None"
                #if $key.k.kw == "text"
                    --kw "$key.k.txt" "$key.k.ncol" "$key.match"
                #else if $key.k.kw == "file"
                    --kw_file "$key.k.file" "$key.k.ncol" "$key.match"
                #end if
            #end if
        #end for

        ## Numerical values: each filter is passed as <threshold> <column> <operator>
        #for $i, $val in enumerate($value)
            #if $val.v.val != "None"
                --value
                #if $val.v.val == "Equal"
                    ## The column belongs to the conditional of the current repeat item ($val.v), not to the repeat itself.
                    $val.v.equal "$val.v.ncol" "="
                #else if $val.v.val == "Higher"
                    $val.v.higher "$val.v.ncol" ">"
                #else if $val.v.val == "Equal or higher"
                    $val.v.equal_higher "$val.v.ncol" ">="
                #else if $val.v.val == "Lower"
                    $val.v.lower "$val.v.ncol" "<"
                #else
                    $val.v.equal_lower "$val.v.ncol" "<="
                #end if
            #end if
        #end for

    ]]></command>
    <inputs>
        <param type="data" name="input1" format="txt,tabular" label="Input file" help="Input file is a tab-delimited file containing proteomics results (e.g. output file from MaxQuant or Proline software)" />
        <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does your input file contain header?" />
        <repeat name="keyword" title="Filter by keywords" >
            <!-- falsevalue is spelled out because filter_kw_val.py treats every value other than "false" as an exact match. -->
            <param type="boolean" name="match" truevalue="True" falsevalue="false" label="Would you like to search for exact match?" help='Choosing "Yes" removes only the rows whose field is exactly equal to a keyword (the comparison is not case-sensitive), see below for more detail' />
            <conditional name="k" >
                <param argument="--kw" type="select" label="Filter by keyword" >
                    <option value="None" selected="True">---</option>
                    <option value="text">Enter keywords</option>
                    <option value="file">Choose a file containing keywords</option>
                </param>
                <when value="None" />
                <when value="text" >
                    <param name="txt" type="text" label="Enter keywords to be removed" help='Separate several keywords with ":", for example A8K2U0:Q5TA79:O43175' >
                        <sanitizer>
                            <valid initial="string.printable">
                                <remove value="&apos;"/>
                            </valid>
                            <mapping initial="none">
                                <add source="&apos;" target="__sq__"/>
                            </mapping>
                        </sanitizer>
                    </param>
                    <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' />
                </when>
                <when value="file" >
                    <param name="file" type="data" format="txt,tabular" label="Choose a file containing keywords" />
                    <param name="ncol" type="text" value="c1" label="Please specify the column on which to apply this filter" help='For example, fill in "c1" if the keyword you want to filter out is expected in the first column' />
                </when>
            </conditional>
        </repeat>

        <repeat name="value" title="Filter by value" >
            <conditional name="v" >
                <param argument="--val" type="select" label="Filter by value" >
                    <option value="None">---</option>
                    <option value="Equal">=</option>
                    <option value="Higher">&gt;</option>
                    <option value="Equal or higher">&gt;=</option>
                    <option value="Lower">&lt;</option>
                    <option value="Equal or lower">&lt;=</option>
                </param>
                <when value="None" >
                </when>
                <when value="Equal" >
                    <param name="equal" type="float" value="" label="Value" />
                    <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' />
                </when>
                <when value="Higher" >
                    <param type="float" name="higher" value="" label="Value" />
                    <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' />
                </when>
                <when value="Equal or higher" >
                    <param type="float" name="equal_higher" value="" label="Value" />
                    <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' />
                </when>
                <when value="Lower" >
                    <param type="float" name="lower" value="" label="Value" />
                    <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' />
                </when>
                <when value="Equal or lower" >
                    <param type="float" name="equal_lower" value="" label="Value" />
                    <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' />
                </when>
            </conditional>
        </repeat>

    </inputs>
    <outputs>
        <data name="output1" format="tabular" label="${tool.name} on ${input1.name}" />
        <data name="trash_file" format="tabular" label="Removed proteins from input file" />
    </outputs>
    <tests>
        <test>
            <param name="input1" value="UnipIDs.txt" />
            <param name="header" value="false" />
            <repeat name="keyword">
                <param name="match" value="false" />
                <conditional name="k">
                    <param name="kw" value="text" />
                    <param name="txt" value="A" />
                    <param name="ncol" value="c3" />
                </conditional>
            </repeat>
            <repeat name="value">
                <conditional name="v">
                    <param name="val" value="Equal or higher"/>
                    <param name="equal_higher" value="1.0" />
                    <param name="ncol" value="c2" />
                </conditional>
            </repeat>
            <output name="output1" file="filter_keywords_values_output.txt" />
            <output name="trash_file" file="filter_keywords_values_removed.txt" />
        </test>
    </tests>
    <help><![CDATA[
This tool removes unneeded data (e.g. contaminants, non-significant values) from a proteomics results file (e.g. MaxQuant or Proline output).

**For each row, if the filtered column contains more than one protein ID/protein name/gene name, only the first one is kept in the output**

**Filter the file by keywords**

Several options can be used. For each option, you can fill in the field or upload a file which contains the keywords.

- If you choose to fill in the field, the keywords should be separated by ":", for example: A8K2U0:Q5TA79:O43175

- If you choose to upload a file, it must be a text file in which each line is a keyword, for example:

  REV

  TRYP_PIG

  ALDOA_RABBIT

**Every line that contains one of these keywords will be removed from the input file.**

**The keyword search performs either an exact or a partial match, depending on the following option**

- If you choose **Yes**, only the lines in which the field is exactly equal to a keyword will be removed.

- If you choose **No**, all the lines in which the field contains a keyword will be removed.

For example:

**Yes** option (exact match) selected using the keyword "kinase": only lines in which the field is exactly "kinase" are removed.

**No** option (partial match) for "kinase": not only lines which contain "kinase" but also lines with "alpha-kinase" (and so on) are removed.

**Filter the file by values**

You can choose to use one or more options (e.g. to filter out peptides of low intensity value, by q-value, etc.).

* For each option, you can choose between "=", ">", ">=", "<" and "<=", then enter the threshold value and specify the column to which the option applies. Rows whose value satisfies the condition are kept; the other rows are removed.

**Output**

The tool will produce 2 output files.

* A text file containing the resulting filtered input file.

* A text file containing the rows removed from the input file.

-----

.. class:: infomark

**Authors**

T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR

Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform

This work has been partially funded through the French National Agency for Research (ANR) IFB project.

Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool.

    ]]></help>
    <citations>
    </citations>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/UnipIDs.txt Sun Nov 26 18:36:43 2017 -0500 @@ -0,0 +1,25 @@ +P04637 +P08246 +P63244 +P10275 +P00533 +Q14524 +P05067 +P35555 +P35222 +O95273 +P00451 +P38398 +Q05086 +Q12802 +P68871 +P04585 +Q96EB6 +Q9NYL2 +P31749 +P01137 +Q5S007 +Q08379 +P02649 +P35498 +P12931
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/filter_keywords_values_output.txt Sun Nov 26 18:36:43 2017 -0500 @@ -0,0 +1,14 @@ +P08246 2 B0 +P63244 1.5 C1 +Q14524 3.5 D1 +P05067 1 B3 +P00451 2 B2 +P38398 5 B4 +Q12802 3 D5 +P68871 1.5 B4 +P04585 2.5 D3 +Q9NYL2 1 B1 +P01137 5 B6 +Q5S007 8 D4 +Q08379 2 C4 +P35498 1 C5