Mercurial > repos > lnguyen > filter_keywords_values

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_kw_val.py	Fri Sep 15 09:03:45 2017 -0400
@@ -0,0 +1,209 @@
+import argparse
+import re
+
+def options():
+    """
+    Parse the command-line arguments and run the filters.
+
+    Arguments:
+        -i, --input: input file path and header flag joined by a comma,
+            i.e. "<path>,<true|false>" (split apart in filters())
+        -m, --match: accepted for compatibility; filters() does not read it,
+            the match mode is the third value of each --kw / --kw_file
+        --kw: KEYWORDS COLUMN MATCH, repeatable
+        --kw_file: KEYWORD_FILE COLUMN MATCH, repeatable
+        --value: VALUE COLUMN OPERATOR, repeatable
+        -o, --output: output filename, default is output.txt
+        --trash_file: extra output file containing removed lines
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-i", "--input", required=True,
+        help='Input file and header flag, given as "<path>,<true|false>"')
+    parser.add_argument(
+        "-m", "--match",
+        help="Exact match (currently unused; give the match mode as the "
+             "third value of --kw / --kw_file)")
+    # nargs="+" with action="append" yields one list per occurrence of the
+    # option; filters() reads the first three values of each list
+    parser.add_argument(
+        "--kw", nargs="+", action="append",
+        help='Keyword filter: KEYWORDS COLUMN MATCH, where KEYWORDS are '
+             'separated by ":", COLUMN looks like "c1" and MATCH is "false" '
+             'for partial matching (any other value means exact matching)')
+    parser.add_argument(
+        "--kw_file", nargs="+", action="append",
+        help="Keyword filter read from a file: FILE COLUMN MATCH, where FILE "
+             "holds one keyword per line")
+    parser.add_argument(
+        "--value", nargs="+", action="append",
+        help='Value filter: VALUE COLUMN OPERATOR, where OPERATOR is one of '
+             '"<", "<=", ">", ">=" (anything else means equality); lines '
+             'that do not satisfy the comparison are removed')
+    parser.add_argument(
+        "-o", "--output", default="output.txt",
+        help="Output file for the lines that were kept")
+    parser.add_argument(
+        "--trash_file", default="trash_MQfilter.txt",
+        help="Output file for the lines that were removed")
+
+    args = parser.parse_args()
+
+    filters(args)
+
+def isnumber(format, n):
+    """
+    Check whether the string n is written as an integer or a float.
+
+    format: "int" or "float"; any other value makes the function return False
+    n: string to test
+
+    "int" accepts an optional minus sign followed by digits that do not start
+    with 0, so "0" is rejected (callers use this to refuse column "c0").
+    "float" accepts an optional minus sign, digits, an optional fractional
+    part and an optional exponent, e.g. "5", "0.05", "-3.5", "1e-5".
+
+    Returns True if n matches the requested format, False otherwise.
+    """
+    # Raw strings avoid the invalid "\-" escape of a plain string literal
+    if format == "int":
+        pattern = r"^-?[1-9][0-9]*$"
+    elif format == "float":
+        pattern = r"^-?[0-9]+(?:\.[0-9]+)?(?:[eE][-+]?[0-9]+)?$"
+    else:
+        return False
+    return re.match(pattern, n) is not None
+
+def filters(args):
+    """
+    Apply every requested filter to the input file and write the results.
+
+    args: parsed command-line arguments providing input, kw, kw_file, value,
+        output and trash_file (see options())
+
+    Keyword filters (--kw, then --kw_file) run before value filters; each
+    filter receives the lines kept and the lines removed by the previous one.
+    The kept lines are written to args.output and the removed lines to
+    args.trash_file.
+
+    Raises ValueError if the input argument has no ",<header flag>" suffix or
+    if a value filter is not given a number.
+    """
+
+    # The input argument is "<path>,<header flag>"; split on the last comma
+    # so that a path which itself contains commas is kept intact
+    MQfilename, separator, header = args.input.rpartition(",")
+    if not separator:
+        raise ValueError('Input must be given as "<file>,<true|false>"')
+
+    # results[0]: lines kept so far, results[1]: lines removed so far
+    results = [readMQ(MQfilename), None]
+
+    # Extract keyword arguments
+    if args.kw:
+        for k in args.kw:
+            results = filter_keyword(results[0], header, results[1], k[0], k[1], k[2])
+    if args.kw_file:
+        for kf in args.kw_file:
+            ids = readOption(kf[0])
+            results = filter_keyword(results[0], header, results[1], ids, kf[1], kf[2])
+
+    # Extract value arguments
+    if args.value:
+        for v in args.value:
+            if isnumber("float", v[0]):
+                results = filter_value(results[0], header, results[1], v[0], v[1], v[2])
+            else:
+                raise ValueError("Please enter a number in filter by value")
+
+    # results[1] is still None when no filter was requested
+    removed = results[1] if results[1] else []
+
+    # Write results to output
+    with open(args.output, "w") as output:
+        output.write("".join(results[0]))
+
+    # Write deleted lines to trash_file
+    with open(args.trash_file, "w") as trash:
+        trash.write("".join(removed))
+
+def readOption(filename):
+    """
+    Read file containing keywords to filter out
+
+    filename: path of a text file with one keyword per line
+
+    Returns the keywords joined by ":", the format filter_keyword() expects.
+    Lines that are empty or contain only whitespace are skipped so that they
+    never become empty keywords.
+    """
+    # The with-block closes the file even if reading fails
+    with open(filename, "r") as f:
+        lines = f.read().split("\n")
+    return ":".join(line for line in lines if line.strip())
+
+def readMQ(MQfilename):
+    """
+    Read input file and return list of file's lines
+
+    MQfilename: path of the input file
+
+    Returns the lines with their line endings, leaving out every line that
+    is empty or contains only whitespace.
+    """
+    # Build a new list instead of removing items from the list being
+    # iterated, which would skip the line following each removed one
+    with open(MQfilename, "r") as mqfile:
+        return [line for line in mqfile if line.strip()]
+
+def filter_keyword(MQfile, header, filtered_lines, kws, ncol, match):
+    """
+    Filter keywords
+
+    MQfile: list of the input lines (each with its line ending)
+    header: "true" if the first line is a header, which is then always kept
+    filtered_lines: lines removed by previous filters (None or empty if none)
+    kws: keywords separated by ":"; empty keywords are ignored
+    ncol: column to test, written like "c1" for the first column
+    match: "false" removes a line when a keyword is contained in one of its
+        IDs; any other value removes it only when a keyword equals an ID
+
+    The tested field may hold several IDs separated by ";". Comparison is
+    case-insensitive. In every returned line that field is reduced to its
+    first ID with double quotes removed.
+
+    Returns (kept lines, removed lines); the arguments are not modified.
+    Raises ValueError for an invalid column and IndexError if a line does
+    not have the requested column.
+    """
+    # Check if column number is in right form
+    if isnumber("int", ncol.replace("c", "")):
+        id_index = int(ncol.replace("c", "")) - 1
+    else:
+        raise ValueError("Please specify the column where you would like to apply the filter with valid format")
+
+    # Extract list of keywords to filter out; an empty keyword would be
+    # contained in every ID, so blanks must all be dropped
+    keywords = [kw for kw in kws.upper().split(":") if kw.strip()]
+
+    # Separate header and content of input file
+    has_header = header == "true" and len(MQfile) > 0
+    if has_header:
+        kept = [MQfile[0]]
+        content = MQfile[1:]
+    else:
+        kept = []
+        content = MQfile
+
+    # List of lines removed from input file, continued from other filters
+    if filtered_lines:
+        removed = list(filtered_lines)
+    elif has_header:
+        removed = [MQfile[0]]
+    else:
+        removed = []
+
+    exact = match != "false"
+
+    # Filter out the lines containing keywords
+    for line in content:
+        # Set the line ending aside so the last column is compared without it
+        body = line.rstrip("\r\n")
+        ending = line[len(body):]
+        fields = body.split("\t")
+        if id_index >= len(fields):
+            raise IndexError("Column %d not found in line: %r" % (id_index + 1, line))
+
+        ids = fields[id_index].replace('"', "").split(";")
+        # Take only first ID, replacing that one field and nothing else
+        fields[id_index] = ids[0]
+        one_id_line = "\t".join(fields) + ending
+
+        upper_ids = [pid.upper() for pid in ids]
+        if exact:
+            found = any(pid in keywords for pid in upper_ids)
+        else:
+            found = any(kw in pid for pid in upper_ids for kw in keywords)
+
+        if found:
+            removed.append(one_id_line)
+        else:
+            kept.append(one_id_line)
+    return kept, removed
+
+def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt):
+    """
+    Filter values
+
+    MQfile: list of the input lines (each with its line ending)
+    header: "true" if the first line is a header, which is then always kept
+    filtered_prots: lines removed by previous filters (None or empty if none)
+    filter_value: number (or numeric string) to compare each cell against
+    ncol: column to test, written like "c1" for the first column
+    opt: "<", "<=", ">" or ">="; any other value means equality
+
+    A line is kept when "cell opt filter_value" is true and removed when it
+    is false. Lines whose cell is not a number are kept unchanged.
+
+    Returns (kept lines, removed lines); the arguments are not modified.
+    Raises ValueError for an invalid column or a non-numeric filter_value,
+    and IndexError if a line does not have the requested column.
+    """
+    # Check if column number is in right form
+    if ncol and isnumber("int", ncol.replace("c", "")):
+        index = int(ncol.replace("c", "")) - 1
+    else:
+        raise ValueError("Please specify the column where you would like to apply the filter with valid format")
+
+    # Convert the threshold once instead of on every line
+    threshold = float(filter_value)
+
+    def satisfied(number):
+        # True when the cell value passes the requested comparison
+        if opt == "<":
+            return number < threshold
+        if opt == "<=":
+            return number <= threshold
+        if opt == ">":
+            return number > threshold
+        if opt == ">=":
+            return number >= threshold
+        return number == threshold
+
+    # Plain, signed and scientific notation; words such as "NaN" do not
+    # match and therefore leave their line untouched
+    numeric = re.compile(r"^[-+]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?$")
+
+    # Separate header and content of input file
+    has_header = header == "true" and len(MQfile) > 0
+    if has_header:
+        kept = [MQfile[0]]
+        content = MQfile[1:]
+    else:
+        kept = []
+        content = MQfile
+
+    # List of lines removed from input file, continued from other filters
+    if filtered_prots:
+        removed = list(filtered_prots)
+    elif has_header:
+        removed = [MQfile[0]]
+    else:
+        removed = []
+
+    # Filter out the lines that do not meet the condition
+    for prot in content:
+        # Drop the line ending first so a value in the last column is
+        # still recognised as a number
+        fields = prot.rstrip("\r\n").split("\t")
+        if index >= len(fields):
+            raise IndexError("Column %d not found in line: %r" % (index + 1, prot))
+        pep = fields[index].replace('"', "").strip()
+        if numeric.match(pep) and not satisfied(float(pep)):
+            removed.append(prot)
+        else:
+            kept.append(prot)
+    return kept, removed
+
+# Run the command-line entry point only when the file is executed as a script
+if __name__ == "__main__":
+    options()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_kw_val.xml	Fri Sep 15 09:03:45 2017 -0400
@@ -0,0 +1,202 @@
+<tool id="MQoutputfilter" name="Filter out keywords and/or numerical values" version="0.1.0">
+    <description>Filter a file by keywords or values</description>
+    <requirements>
+    </requirements>
+    <stdio>
+        <exit_code range="1:" />
+    </stdio>
+    <command><![CDATA[
+        python $__tool_directory__/filter_kw_val.py
+        -i "$input1,$header"
+        -o "$output1"
+        --trash_file "$trash_file"
+
+        ## Keyword filters: one --kw / --kw_file triple
+        ## (keywords or keyword file, column, exact-match flag) per repeat entry
+        #for $i, $key in enumerate($keyword)
+            #if $key.k.kw != "None"
+                #if $key.k.kw == "text"
+                    --kw "$key.k.txt" "$key.k.ncol" "$key.match"
+                #else if $key.k.kw == "file"
+                    --kw_file "$key.k.file" "$key.k.ncol" "$key.match"
+                #end if
+            #end if
+        #end for
+
+        ## Value filters: one --value triple (value, column, operator) per
+        ## repeat entry; the column is read from the current entry ($val.v)
+        #for $i, $val in enumerate($value)
+            #if $val.v.val != "None"
+                --value
+                #if $val.v.val == "Equal"
+                    $val.v.equal "$val.v.ncol" "="
+                #else if $val.v.val == "Higher"
+                    $val.v.higher "$val.v.ncol" ">"
+                #else if $val.v.val == "Equal or higher"
+                    $val.v.equal_higher "$val.v.ncol" ">="
+                #else if $val.v.val == "Lower"
+                    $val.v.lower "$val.v.ncol" "<"
+                #else
+                    $val.v.equal_lower "$val.v.ncol" "<="
+                #end if
+            #end if
+        #end for
+
+    ]]></command>
+    <inputs>
+        <param type="data" name="input1" format="txt,tabular" label="Input file" help="Input file is a tab-delimited file containing proteomics results (e.g. an output file from MaxQuant or Proline)" />
+        <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does your input file contain header?" />
+        <repeat name="keyword" title="Filter by keywords" >
+            <param type="boolean" name="match" truevalue="True" label="Would you like to search for exact match?" help='Choosing "Yes" will only filter out exact match (i.e. case sensitive), see below for more detail' />
+            <conditional name="k" >
+                <param argument="--kw" type="select" label="Filter by keyword" >
+                    <option value="None" selected="True">---</option>
+                    <option value="text">Enter keywords</option>
+                    <option value="file">Choose a file containing keywords</option>
+                </param>
+                <when value="None" />
+                <when value="text" >
+                    <param name="txt" type="text" label="Enter keywords or a file containing keywords to be removed" >
+                        <sanitizer>
+                        <valid initial="string.printable">
+                            <remove value="&apos;"/>
+                        </valid>
+                        <mapping initial="none">
+                            <add source="&apos;" target="__sq__"/>
+                        </mapping>
+                        </sanitizer>
+                    </param>
+                    <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' />
+                </when>
+                <when value="file" >
+                    <param name="file" type="data" format="txt,tabular" label="Choose a file containing keywords" />
+                    <param name="ncol" type="text" value="c1" label="Please specify the column on which to apply this filter" help='For example, fill in "c1" if the keyword you want to filter out is expected in the first column' />
+                </when>
+            </conditional>
+        </repeat>
+
+        <repeat name="value" title="Filter by value" >
+            <conditional name="v" >
+                <param argument="--val" type="select" label="Filter by value" >
+                    <option value="None">---</option>
+                    <option value="Equal">=</option>
+                    <option value="Higher">&gt;</option>
+                    <option value="Equal or higher">&gt;=</option>
+                    <option value="Lower">&lt;</option>
+                    <option value="Equal or lower">&lt;=</option>
+                </param>
+                <when value="None" >
+                </when>
+                <when value="Equal" >
+                    <param name="equal" type="float" value="" label="Value" />
+                    <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' />
+                </when>
+                <when value="Higher" >
+                    <param type="float" name="higher" value="" label="Value" />
+                    <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' />
+                </when>
+                <when value="Equal or higher" >
+                    <param type="float" name="equal_higher" value="" label="Value" />
+                    <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' />
+                </when>
+                <when value="Lower" >
+                    <param type="float" name="lower" value="" label="Value" />
+                    <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' />
+                </when>
+                <when value="Equal or lower" >
+                    <param type="float" name="equal_lower" value="" label="Value" />
+                    <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' />
+                </when>
+            </conditional>
+        </repeat>
+
+    </inputs>
+    <outputs>
+        <data name="output1" format="tabular" label="${tool.name} on ${input1.name}" />
+        <data name="trash_file" format="tabular" label="Removed proteins from input file" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="input1" value="UnipIDs.txt" />
+            <param name="header" value="false" />
+            <repeat name="keyword">
+                <param name="match" value="false" />
+                <conditional name="k">
+                    <param name="kw" value="text" />
+                    <param name="txt" value="A" />
+                    <param name="ncol" value="c3" />
+                </conditional>
+            </repeat>
+            <repeat name="value">
+                <conditional name="v">
+                    <param name="val" value="Equal or higher"/>
+                    <param name="equal_higher" value="1.0" />
+                    <param name="ncol" value="c2" />
+                </conditional>
+            </repeat>
+            <output name="output1" file="filter_keywords_values_output.txt" />
+            <output name="trash_file" file="filter_keywords_values_removed.txt" />
+        </test>
+    </tests>
+    <help><![CDATA[
+This tool removes unneeded data (e.g. contaminants, non-significant values) from a proteomics results file (e.g. MaxQuant or Proline output).
+
+**For each row, if the column used by a keyword filter contains more than one protein ID/protein name/gene name (separated by ";"), only the first one is kept in the output**
+
+**Filter the file by keywords**
+
+Several options can be used. For each option, you can fill in the field or upload a file which contains the keywords.
+
+- If you choose to fill in the field, the keywords should be separated by ":", for example: A8K2U0:Q5TA79:O43175
+
+- If you choose to upload a file, it must be a text file in which each line is a keyword, for example:
+
+ REV
+
+ TRYP_PIG
+
+ ALDOA_RABBIT
+
+**The line that contains these keywords will be eliminated from input file.**
+
+**Keywords search can be applied by performing either exact match or partial one by using the following option**
+
+- If you choose **Yes**, only the fields that contain exactly the same content will be removed.
+
+- If you choose **No**, all the fields containing the keyword will be removed.
+
+For example:
+
+**Yes** option (exact match) selected using the keyword "kinase": only lines which contain exactly "kinase" are removed.
+
+**No** option (partial match) for "kinase": not only lines which contain "kinase" but also lines with "alpha-kinase" (and so  on) are removed.
+
+**Filter the file by values**
+
+You can choose to use one or more options (e.g. to filter out peptides of low intensity value, by q-value, etc.).
+
+* For each option, you can choose between "=", ">", ">=", "<" and "<=", then enter the value to filter and specify the column to apply that option.
+
+**Output**
+
+The tool will produce 2 output files.
+
+* A text file containing the resulting filtered input file.
+
+* A text file containing the rows removed from the input file.
+
+-----
+
+.. class:: infomark
+
+**Authors**
+
+T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR
+Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform
+
+This work has been partially funded through the French National Agency for Research (ANR) IFB project.
+
+Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool.
+
+    ]]></help>
+    <citations>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/UnipIDs.txt	Fri Sep 15 09:03:45 2017 -0400
@@ -0,0 +1,25 @@
+P04637	1	A0
+P08246	2	B0
+P63244	1.5	C1
+P10275	3	A2
+P00533	2	A3
+Q14524	3.5	D1
+P05067	1	B3
+P35555	0	C0
+P35222	0.9	D2
+O95273	1.1	A4
+P00451	2	B2
+P38398	5	B4
+Q05086	0	C2
+Q12802	3	D5
+P68871	1.5	B4
+P04585	2.5	D3
+Q96EB6	0	C3
+Q9NYL2	1	B1
+P31749	3	A1
+P01137	5	B6
+Q5S007	8	D4
+Q08379	2	C4
+P02649	0	B5
+P35498	1	C5
+P12931	3	A5
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filter_keywords_values_output.txt	Fri Sep 15 09:03:45 2017 -0400
@@ -0,0 +1,14 @@
+P08246	2	B0
+P63244	1.5	C1
+Q14524	3.5	D1
+P05067	1	B3
+P00451	2	B2
+P38398	5	B4
+Q12802	3	D5
+P68871	1.5	B4
+P04585	2.5	D3
+Q9NYL2	1	B1
+P01137	5	B6
+Q5S007	8	D4
+Q08379	2	C4
+P35498	1	C5
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filter_keywords_values_removed.txt	Fri Sep 15 09:03:45 2017 -0400
@@ -0,0 +1,11 @@
+P04637	1	A0
+P10275	3	A2
+P00533	2	A3
+O95273	1.1	A4
+P31749	3	A1
+P12931	3	A5
+P35555	0	C0
+P35222	0.9	D2
+Q05086	0	C2
+Q96EB6	0	C3
+P02649	0	B5