# HG changeset patch
# User proteore
# Date 1521037494 14400
# Node ID 1e99111901424e224aacebb860a816858257b19e
# Parent 2c1012e0a628b13e338875b5e3e5a6363f38d4c6
planemo upload commit 08f1831e097df5d74bf60ff5955e7e9c8e524cc8-dirty
diff -r 2c1012e0a628 -r 1e9911190142 README.rst
--- a/README.rst Thu Mar 08 10:41:08 2018 -0500
+++ b/README.rst Wed Mar 14 10:24:54 2018 -0400
@@ -13,7 +13,7 @@
-------------------------------------------------------
-This tool allows to remove unneeded data (e.g. contaminants, non-significant values) from a proteomics results file (e.g. MaxQuant or Proline output).
+This tool allows you to filter out data according to your specific needs (e.g. contaminants, non-significant values, or rows related to a particular annotation) from a proteomics results file (e.g. MaxQuant or Proline output).
**For each row, if there are more than one protein IDs/protein names/gene names, only the first one will be considered in the output**
@@ -31,17 +31,17 @@
ALDOA_RABBIT
-**The line that contains these keywords will be eliminated from input file.**
+**Lines that contain these keywords will be filtered out of the input file and written to a separate file.**
**Keywords search can be applied by performing either exact match or partial one by using the following option**
-- If you choose **Yes**, only the fields that contains exactly the same content will be removed.
+- If you choose **Yes**, only the fields that contain exactly the same content will be filtered.
-- If you choose **No**, all the fields containing the keyword will be removed.
+- If you choose **No**, all the fields containing the keyword will be filtered.
For example:
-**Yes** option (exact match) selected using the keyword "kinase": only lines which contain exactly "kinase" is removed.
+**Yes** option (exact match) selected using the keyword "kinase": only lines which contain exactly "kinase" are filtered (and not "Kinase").
**No** option (partial match) for "kinase": not only lines which contain "kinase" but also lines with "alpha-kinase" (and so on) are removed.
@@ -57,4 +57,4 @@
* A text file containing the resulting filtered input file.
-* A text file containing the rows removed from the input file.
\ No newline at end of file
+* A text file containing the rows that have been filtered from the input file.
diff -r 2c1012e0a628 -r 1e9911190142 filter_kw_val.py
--- a/filter_kw_val.py Thu Mar 08 10:41:08 2018 -0500
+++ b/filter_kw_val.py Wed Mar 14 10:24:54 2018 -0400
@@ -4,12 +4,22 @@
def options():
"""
- Parse options
+ Parse options:
+ -i, --input Input filename and boolean value if the file contains header ["filename,true/false"]
+ -m, --match if the keywords should be filtered in exact
+ --kw Keyword to be filtered, the column number where this filter applies,
+ boolean value if the keyword should be filtered in exact ["keyword,ncol,true/false"].
+ This option can be repeated: --kw "kw1,c1,true" --kw "kw2,c1,false" --kw "kw3,c2,true"
+ --kw_file A file that contains keywords to be filtered, the column where this filter applies and
+ boolean value if the keyword should be filtered in exact ["filename,ncol,true/false"]
+ --value The value to be filtered, the column number where this filter applies and the
+ operation symbol ["value,ncol,=/>/>=/<="]
+ -o, --output The output filename
+ --trash_file The file that will contain the removed lines
"""
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", help="Input file", required=True)
- parser.add_argument("-m", "--match", help="Exact macth")
- parser.add_argument("--kw", nargs="+", action="append", help="") #
+ parser.add_argument("--kw", nargs="+", action="append", help="")
parser.add_argument("--kw_file", nargs="+", action="append", help="")
parser.add_argument("--value", nargs="+", action="append", help="")
parser.add_argument("-o", "--output", default="output.txt")
@@ -19,16 +29,12 @@
filters(args)
- # python filter2.py -i "/projet/galaxydev/galaxy/tools/proteore_uc1/proteinGroups_Maud.txt"
- # --protein_IDs "A2A288:A8K2U0" --peptides 2 "=" -o "test-data/output_MQfilter.txt"
-
-
def isnumber(number_format, n):
"""
Check if a variable is a float or an integer
"""
- float_format = re.compile("^[\-]?[1-9][0-9]*\.?[0-9]+$")
- int_format = re.compile("^[\-]?[1-9][0-9]*$")
+ float_format = re.compile(r"^[-]?[1-9][0-9]*.?[0-9]+$")
+ int_format = re.compile(r"^[-]?[1-9][0-9]*$")
test = ""
if number_format == "int":
test = re.match(int_format, n)
@@ -36,8 +42,6 @@
test = re.match(float_format, n)
if test:
return True
-# else:
-# return False
def filters(args):
"""
@@ -66,15 +70,16 @@
# Write results to output
output = open(args.output, "w")
- output.write("\n".join(results[0]))
+ output.write("".join(results[0]))
output.close()
# Write deleted lines to trash_file
trash = open(args.trash_file, "w")
- trash.write("\n".join(results[1]))
+ trash.write("".join(results[1]))
trash.close()
def readOption(filename):
+ # Read the keywords file to extract the list of keywords
f = open(filename, "r")
file_content = f.read()
filter_list = file_content.split("\n")
@@ -85,7 +90,7 @@
return filters
def readMQ(MQfilename):
- # Read MQ file
+ # Read input file
mqfile = open(MQfilename, "r")
mq = mqfile.readlines()
# Remove empty lines (contain only space or new line or "")
@@ -95,7 +100,7 @@
def filter_keyword(MQfile, header, filtered_lines, ids, ncol, match):
mq = MQfile
if isnumber("int", ncol.replace("c", "")):
- id_index = int(ncol.replace("c", "")) - 1 #columns.index("Majority protein IDs")
+ id_index = int(ncol.replace("c", "")) - 1
else:
raise ValueError("Please specify the column where "
"you would like to apply the filter "
@@ -124,28 +129,29 @@
for line in content:
line = line.replace("\n", "")
id_inline = line.split("\t")[id_index].replace('"', "").split(";")
- one_id_line = line.replace(line.split("\t")[id_index], id_inline[0]) # Take only first IDs
+ # Take only first IDs
+ #one_id_line = line.replace(line.split("\t")[id_index], id_inline[0])
line = line + "\n"
if match != "false":
# Filter protein IDs
if any(pid.upper() in ids for pid in id_inline):
- filtered_lines.append(one_id_line)
+ filtered_lines.append(line)
mq.remove(line)
- else:
- mq[mq.index(line)] = one_id_line
+ #else:
+ # mq[mq.index(line)] = one_id_line
else:
if any(ft in pid.upper() for pid in id_inline for ft in ids):
- filtered_lines.append(one_id_line)
+ filtered_lines.append(line)
mq.remove(line)
- else:
- mq[mq.index(line)] = one_id_line
+ #else:
+ # mq[mq.index(line)] = one_id_line
return mq, filtered_lines
def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt):
mq = MQfile
- if ncol and isnumber("int", ncol.replace("c", "")): #"Gene names" in columns:
- index = int(ncol.replace("c", "")) - 1 #columns.index("Gene names")
+ if ncol and isnumber("int", ncol.replace("c", "")):
+ index = int(ncol.replace("c", "")) - 1
else:
raise ValueError("Please specify the column where "
"you would like to apply the filter "
@@ -187,7 +193,7 @@
if float(pep) != filter_value:
filtered_prots.append(line)
mq.remove(line)
- return mq, filtered_prots #output, trash_file
+ return mq, filtered_prots
if __name__ == "__main__":
options()
diff -r 2c1012e0a628 -r 1e9911190142 filter_kw_val.xml
--- a/filter_kw_val.xml Thu Mar 08 10:41:08 2018 -0500
+++ b/filter_kw_val.xml Wed Mar 14 10:24:54 2018 -0400
@@ -55,7 +55,7 @@
-
+
@@ -106,7 +106,7 @@
-
+
@@ -125,7 +125,7 @@