Mercurial > repos > proteore > filter_keywords_values

--- a/README.rst	Thu Mar 08 10:41:08 2018 -0500
+++ b/README.rst	Wed Mar 14 10:24:54 2018 -0400
@@ -13,7 +13,7 @@

 -------------------------------------------------------

-This tool allows to remove unneeded data (e.g. contaminants, non-significant values) from a proteomics results file (e.g. MaxQuant or Proline output).
+This tool allows you to filter out data according to your specific needs (e.g. contaminants, non-significant values, or rows related to a particular annotation) from a proteomics results file (e.g. MaxQuant or Proline output).

 **For each row, if there are more than one protein IDs/protein names/gene names, only the first one will be considered in the output**

@@ -31,17 +31,17 @@

 ALDOA_RABBIT

-**The line that contains these keywords will be eliminated from input file.**
+**Lines that contain these keywords will be filtered out of the input file and provided in a separate file.**

 **Keywords search can be applied by performing either exact match or partial one by using the following option**

-- If you choose **Yes**, only the fields that contains exactly the same content will be removed.
+- If you choose **Yes**, only the fields that contain exactly the same content will be filtered.

-- If you choose **No**, all the fields containing the keyword will be removed.
+- If you choose **No**, all the fields containing the keyword will be filtered.

 For example:

-**Yes** option (exact match) selected using the keyword "kinase": only lines which contain exactly "kinase" is removed.
+**Yes** option (exact match) selected using the keyword "kinase": only lines which contain exactly "kinase" are filtered (and not "Kinase").

 **No** option (partial match) for "kinase": not only lines which contain "kinase" but also lines with "alpha-kinase" (and so  on) are removed.

@@ -57,4 +57,4 @@

 * A text file containing the resulting filtered input file.

-* A text file containing the rows removed from the input file.
\ No newline at end of file
+* A text file containing the rows that have been filtered from the input file.
--- a/filter_kw_val.py	Thu Mar 08 10:41:08 2018 -0500
+++ b/filter_kw_val.py	Wed Mar 14 10:24:54 2018 -0400
@@ -4,12 +4,22 @@

 def options():
     """
-    Parse options
+    Parse options:
+        -i, --input     Input filename and boolean value if the file contains header ["filename,true/false"]
+        -m, --match     if the keywords should be filtered in exact
+        --kw            Keyword to be filtered, the column number where this filter applies,
+                        boolean value if the keyword should be filtered in exact ["keyword,ncol,true/false"].
+                        This option can be repeated: --kw "kw1,c1,true" --kw "kw2,c1,false" --kw "kw3,c2,true"
+        --kw_file       A file that contains keywords to be filtered, the column where this filter applies and
+                        boolean value if the keyword should be filtered in exact ["filename,ncol,true/false"]
+        --value         The value to be filtered, the column number where this filter applies and the
+                        operation symbol ["value,ncol,=/>/>=/</<="]
+        -o, --output    The output filename
+        --trash_file    The file that contains the removed lines
     """
     parser = argparse.ArgumentParser()
     parser.add_argument("-i", "--input", help="Input file", required=True)
-    parser.add_argument("-m", "--match", help="Exact macth")
-    parser.add_argument("--kw", nargs="+", action="append", help="") #
+    parser.add_argument("--kw", nargs="+", action="append", help="")
     parser.add_argument("--kw_file", nargs="+", action="append", help="")
     parser.add_argument("--value", nargs="+", action="append", help="")
     parser.add_argument("-o", "--output", default="output.txt")
@@ -19,16 +29,12 @@

     filters(args)

-    # python filter2.py -i "/projet/galaxydev/galaxy/tools/proteore_uc1/proteinGroups_Maud.txt"
-    # --protein_IDs "A2A288:A8K2U0" --peptides 2 "=" -o "test-data/output_MQfilter.txt"
-
-
 def isnumber(number_format, n):
     """
     Check if a variable is a float or an integer
     """
-    float_format = re.compile("^[\-]?[1-9][0-9]*\.?[0-9]+$")
-    int_format = re.compile("^[\-]?[1-9][0-9]*$")
+    float_format = re.compile(r"^[-]?[1-9][0-9]*.?[0-9]+$")
+    int_format = re.compile(r"^[-]?[1-9][0-9]*$")
     test = ""
     if number_format == "int":
         test = re.match(int_format, n)
@@ -36,8 +42,6 @@
         test = re.match(float_format, n)
     if test:
         return True
-#    else:
-#        return False

 def filters(args):
     """
@@ -66,15 +70,16 @@

     # Write results to output
     output = open(args.output, "w")
-    output.write("\n".join(results[0]))
+    output.write("".join(results[0]))
     output.close()

     # Write deleted lines to trash_file
     trash = open(args.trash_file, "w")
-    trash.write("\n".join(results[1]))
+    trash.write("".join(results[1]))
     trash.close()

 def readOption(filename):
+    # Read the keywords file to extract the list of keywords
     f = open(filename, "r")
     file_content = f.read()
     filter_list = file_content.split("\n")
@@ -85,7 +90,7 @@
     return filters

 def readMQ(MQfilename):
-    # Read MQ file
+    # Read input file
     mqfile = open(MQfilename, "r")
     mq = mqfile.readlines()
     # Remove empty lines (contain only space or new line or "")
@@ -95,7 +100,7 @@
 def filter_keyword(MQfile, header, filtered_lines, ids, ncol, match):
     mq = MQfile
     if isnumber("int", ncol.replace("c", "")):
-        id_index = int(ncol.replace("c", "")) - 1 #columns.index("Majority protein IDs")
+        id_index = int(ncol.replace("c", "")) - 1
     else:
         raise ValueError("Please specify the column where "
                          "you would like to apply the filter "
@@ -124,28 +129,29 @@
     for line in content:
         line = line.replace("\n", "")
         id_inline = line.split("\t")[id_index].replace('"', "").split(";")
-        one_id_line = line.replace(line.split("\t")[id_index], id_inline[0]) # Take only first IDs
+        # Take only first IDs
+        #one_id_line = line.replace(line.split("\t")[id_index], id_inline[0])
         line = line + "\n"

         if match != "false":
             # Filter protein IDs
             if any(pid.upper() in ids for pid in id_inline):
-                filtered_lines.append(one_id_line)
+                filtered_lines.append(line)
                 mq.remove(line)
-            else:
-                mq[mq.index(line)] = one_id_line
+            #else:
+            #    mq[mq.index(line)] = one_id_line
         else:
             if any(ft in pid.upper() for pid in id_inline for ft in ids):
-                filtered_lines.append(one_id_line)
+                filtered_lines.append(line)
                 mq.remove(line)
-            else:
-                mq[mq.index(line)] = one_id_line
+            #else:
+            #    mq[mq.index(line)] = one_id_line
     return mq, filtered_lines

 def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt):
     mq = MQfile
-    if ncol and isnumber("int", ncol.replace("c", "")): #"Gene names" in columns:
-        index = int(ncol.replace("c", "")) - 1 #columns.index("Gene names")
+    if ncol and isnumber("int", ncol.replace("c", "")):
+        index = int(ncol.replace("c", "")) - 1
     else:
         raise ValueError("Please specify the column where "
                          "you would like to apply the filter "
@@ -187,7 +193,7 @@
                 if float(pep) != filter_value:
                     filtered_prots.append(line)
                     mq.remove(line)
-    return mq, filtered_prots #output, trash_file
+    return mq, filtered_prots

 if __name__ == "__main__":
     options()
--- a/filter_kw_val.xml	Thu Mar 08 10:41:08 2018 -0500
+++ b/filter_kw_val.xml	Wed Mar 14 10:24:54 2018 -0400
@@ -55,7 +55,7 @@
                 </param>
                 <when value="None" />
                 <when value="text" >
-                    <param name="txt" type="text" label="Copy/paste keywords to be removed" help='Keywords should be separated by ";", for example: A8K2U0;Q5TA79;O43175' >
+                    <param name="txt" type="text" label="Copy/paste keywords to be filtered out" help='Keywords should be separated by ";", for example: A8K2U0;Q5TA79;O43175' >
                         <sanitizer>
                         <valid initial="string.printable">
                             <remove value="&apos;"/>
@@ -106,7 +106,7 @@
     </inputs>
     <outputs>
         <data name="output1" format="tabular" label="${tool.name} on ${input1.name}" />
-        <data name="trash_file" format="tabular" label="${tool.name} on ${input1.name} - Removed lines" />
+        <data name="trash_file" format="tabular" label="${tool.name} on ${input1.name} - Filtered lines" />
     </outputs>
     <tests>
         <test>
@@ -125,7 +125,7 @@
         </test>
     </tests>
     <help><![CDATA[
-This tool allows to remove unneeded data (e.g. contaminants, non-significant values) from a proteomics results file (e.g. MaxQuant or Proline output).
+This tool allows you to filter out data according to your specific needs (e.g. contaminants, non-significant values, or rows related to a particular annotation) from a proteomics results file (e.g. MaxQuant or Proline output).

 **For each row, if there are more than one protein IDs/protein names/gene names, only the first one will be considered in the output**

@@ -143,17 +143,17 @@

 ALDOA_RABBIT

-**The line that contains these keywords will be eliminated from input file.**
+**Lines that contain these keywords will be filtered out of the input file and provided in a separate file.**

 **Keywords search can be applied by performing either exact match or partial one by using the following option**

-- If you choose **Yes**, only the fields that contains exactly the same content will be removed.
+- If you choose **Yes**, only the fields that contain exactly the same content will be filtered.

-- If you choose **No**, all the fields containing the keyword will be removed.
+- If you choose **No**, all the fields containing the keyword will be filtered.

 For example:

-**Yes** option (exact match) selected using the keyword "kinase": only lines which contain exactly "kinase" is removed.
+**Yes** option (exact match) selected using the keyword "kinase": only lines which contain exactly "kinase" are filtered (and not "Kinase").

 **No** option (partial match) for "kinase": not only lines which contain "kinase" but also lines with "alpha-kinase" (and so  on) are removed.

@@ -169,7 +169,7 @@

 * A text file containing the resulting filtered input file.

-* A text file containing the rows removed from the input file.
+* A text file containing the rows that have been filtered from the input file.

 -----