Mercurial > repos > proteore > filter_keywords_values
changeset 5:1e9911190142 draft
planemo upload commit 08f1831e097df5d74bf60ff5955e7e9c8e524cc8-dirty
author | proteore |
---|---|
date | Wed, 14 Mar 2018 10:24:54 -0400 |
parents | 2c1012e0a628 |
children | c6ba1e6f6869 |
files | README.rst filter_kw_val.py filter_kw_val.xml |
diffstat | 3 files changed, 45 insertions(+), 39 deletions(-) [+] |
line wrap: on
line diff
--- a/README.rst Thu Mar 08 10:41:08 2018 -0500 +++ b/README.rst Wed Mar 14 10:24:54 2018 -0400 @@ -13,7 +13,7 @@ ------------------------------------------------------- -This tool allows to remove unneeded data (e.g. contaminants, non-significant values) from a proteomics results file (e.g. MaxQuant or Proline output). +This tool allows to filter out data according to your specific needs (e.g. contaminants, non-significant values or related to a particular annotation) from a proteomics results file (e.g. MaxQuant or Proline output). **For each row, if there are more than one protein IDs/protein names/gene names, only the first one will be considered in the output** @@ -31,17 +31,17 @@ ALDOA_RABBIT -**The line that contains these keywords will be eliminated from input file.** +**The line that contains these keywords will be filtered from input file and provided in a separate file.** **Keywords search can be applied by performing either exact match or partial one by using the following option** -- If you choose **Yes**, only the fields that contains exactly the same content will be removed. +- If you choose **Yes**, only the fields that contains exactly the same content will be filtered. -- If you choose **No**, all the fields containing the keyword will be removed. +- If you choose **No**, all the fields containing the keyword will be filtered. For example: -**Yes** option (exact match) selected using the keyword "kinase": only lines which contain exactly "kinase" is removed. +**Yes** option (exact match) selected using the keyword "kinase": only lines which contain exactly "kinase" is filtered (and not "Kinase"). **No** option (partial match) for "kinase": not only lines which contain "kinase" but also lines with "alpha-kinase" (and so on) are removed. @@ -57,4 +57,4 @@ * A text file containing the resulting filtered input file. -* A text file containing the rows removed from the input file. \ No newline at end of file +* A text file containing the rows that have been filtered from the input file.
--- a/filter_kw_val.py Thu Mar 08 10:41:08 2018 -0500 +++ b/filter_kw_val.py Wed Mar 14 10:24:54 2018 -0400 @@ -4,12 +4,22 @@ def options(): """ - Parse options + Parse options: + -i, --input Input filename and boolean value if the file contains header ["filename,true/false"] + -m, --match if the keywords should be filtered in exact + --kw Keyword to be filtered, the column number where this filter applies, + boolean value if the keyword should be filtered in exact ["keyword,ncol,true/false"]. + This option can be repeated: --kw "kw1,c1,true" --kw "kw2,c1,false" --kw "kw3,c2,true" + --kwfile A file that contains keywords to be filter, the column where this filter applies and + boolean value if the keyword should be filtered in exact ["filename,ncol,true/false"] + --value The value to be filtered, the column number where this filter applies and the + operation symbol ["value,ncol,=/>/>=/</<="] + --o --output The output filename + --trash_file The file contains removed lines """ parser = argparse.ArgumentParser() parser.add_argument("-i", "--input", help="Input file", required=True) - parser.add_argument("-m", "--match", help="Exact macth") - parser.add_argument("--kw", nargs="+", action="append", help="") # + parser.add_argument("--kw", nargs="+", action="append", help="") parser.add_argument("--kw_file", nargs="+", action="append", help="") parser.add_argument("--value", nargs="+", action="append", help="") parser.add_argument("-o", "--output", default="output.txt") @@ -19,16 +29,12 @@ filters(args) - # python filter2.py -i "/projet/galaxydev/galaxy/tools/proteore_uc1/proteinGroups_Maud.txt" - # --protein_IDs "A2A288:A8K2U0" --peptides 2 "=" -o "test-data/output_MQfilter.txt" - - def isnumber(number_format, n): """ Check if a variable is a float or an integer """ - float_format = re.compile("^[\-]?[1-9][0-9]*\.?[0-9]+$") - int_format = re.compile("^[\-]?[1-9][0-9]*$") + float_format = re.compile(r"^[-]?[1-9][0-9]*.?[0-9]+$") + int_format = re.compile(r"^[-]?[1-9][0-9]*$") test = "" if number_format == "int": test = re.match(int_format, n) @@ -36,8 +42,6 @@ test = re.match(float_format, n) if test: return True -# else: -# return False def filters(args): """ @@ -66,15 +70,16 @@ # Write results to output output = open(args.output, "w") - output.write("\n".join(results[0])) + output.write("".join(results[0])) output.close() # Write deleted lines to trash_file trash = open(args.trash_file, "w") - trash.write("\n".join(results[1])) + trash.write("".join(results[1])) trash.close() def readOption(filename): + # Read the keywords file to extract the list of keywords f = open(filename, "r") file_content = f.read() filter_list = file_content.split("\n") @@ -85,7 +90,7 @@ return filters def readMQ(MQfilename): - # Read MQ file + # Read input file mqfile = open(MQfilename, "r") mq = mqfile.readlines() # Remove empty lines (contain only space or new line or "") @@ -95,7 +100,7 @@ def filter_keyword(MQfile, header, filtered_lines, ids, ncol, match): mq = MQfile if isnumber("int", ncol.replace("c", "")): - id_index = int(ncol.replace("c", "")) - 1 #columns.index("Majority protein IDs") + id_index = int(ncol.replace("c", "")) - 1 else: raise ValueError("Please specify the column where " "you would like to apply the filter " @@ -124,28 +129,29 @@ for line in content: line = line.replace("\n", "") id_inline = line.split("\t")[id_index].replace('"', "").split(";") - one_id_line = line.replace(line.split("\t")[id_index], id_inline[0]) # Take only first IDs + # Take only first IDs + #one_id_line = line.replace(line.split("\t")[id_index], id_inline[0]) line = line + "\n" if match != "false": # Filter protein IDs if any(pid.upper() in ids for pid in id_inline): - filtered_lines.append(one_id_line) + filtered_lines.append(line) mq.remove(line) - else: - mq[mq.index(line)] = one_id_line + #else: + # mq[mq.index(line)] = one_id_line else: if any(ft in pid.upper() for pid in id_inline for ft in ids): - filtered_lines.append(one_id_line) + filtered_lines.append(line) mq.remove(line) - else: - mq[mq.index(line)] = one_id_line + #else: + # mq[mq.index(line)] = one_id_line return mq, filtered_lines def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt): mq = MQfile - if ncol and isnumber("int", ncol.replace("c", "")): #"Gene names" in columns: - index = int(ncol.replace("c", "")) - 1 #columns.index("Gene names") + if ncol and isnumber("int", ncol.replace("c", "")): + index = int(ncol.replace("c", "")) - 1 else: raise ValueError("Please specify the column where " "you would like to apply the filter " @@ -187,7 +193,7 @@ if float(pep) != filter_value: filtered_prots.append(line) mq.remove(line) - return mq, filtered_prots #output, trash_file + return mq, filtered_prots if __name__ == "__main__": options()
--- a/filter_kw_val.xml Thu Mar 08 10:41:08 2018 -0500 +++ b/filter_kw_val.xml Wed Mar 14 10:24:54 2018 -0400 @@ -55,7 +55,7 @@ </param> <when value="None" /> <when value="text" > - <param name="txt" type="text" label="Copy/paste keywords to be removed" help='Keywords should be separated by ";", for example: A8K2U0;Q5TA79;O43175' > + <param name="txt" type="text" label="Copy/paste keywords to be filtered out" help='Keywords should be separated by ";", for example: A8K2U0;Q5TA79;O43175' > <sanitizer> <valid initial="string.printable"> <remove value="'"/> @@ -106,7 +106,7 @@ </inputs> <outputs> <data name="output1" format="tabular" label="${tool.name} on ${input1.name}" /> - <data name="trash_file" format="tabular" label="${tool.name} on ${input1.name} - Removed lines" /> + <data name="trash_file" format="tabular" label="${tool.name} on ${input1.name} - Filtered lines" /> </outputs> <tests> <test> @@ -125,7 +125,7 @@ </test> </tests> <help><![CDATA[ -This tool allows to remove unneeded data (e.g. contaminants, non-significant values) from a proteomics results file (e.g. MaxQuant or Proline output). +This tool allows to filter out data according to your specific needs (e.g. contaminants, non-significant values or related to a particular annotation) from a proteomics results file (e.g. MaxQuant or Proline output). **For each row, if there are more than one protein IDs/protein names/gene names, only the first one will be considered in the output** @@ -143,17 +143,17 @@ ALDOA_RABBIT -**The line that contains these keywords will be eliminated from input file.** +**The line that contains these keywords will be filtered from input file and provided in a separate file.** **Keywords search can be applied by performing either exact match or partial one by using the following option** -- If you choose **Yes**, only the fields that contains exactly the same content will be removed. +- If you choose **Yes**, only the fields that contains exactly the same content will be filtered. -- If you choose **No**, all the fields containing the keyword will be removed. +- If you choose **No**, all the fields containing the keyword will be filtered. For example: -**Yes** option (exact match) selected using the keyword "kinase": only lines which contain exactly "kinase" is removed. +**Yes** option (exact match) selected using the keyword "kinase": only lines which contain exactly "kinase" is filtered (and not "Kinase"). **No** option (partial match) for "kinase": not only lines which contain "kinase" but also lines with "alpha-kinase" (and so on) are removed. @@ -169,7 +169,7 @@ * A text file containing the resulting filtered input file. -* A text file containing the rows removed from the input file. +* A text file containing the rows that have been filtered from the input file. -----