# -*- coding: utf-8 -*-
# Reconstructed from a whitespace-collapsed Mercurial changeset:
#   HG changeset patch
#   User lnguyen
#   Date 1505480625 14400
#   Node ID 8e3bb3153dc4f515125805b92578115af3b2bdf8
#   planemo upload
#   diff -r 000000000000 -r 8e3bb3153dc4 filter_kw_val.py
#   --- /dev/null Thu Jan 01 00:00:00 1970 +0000
#   +++ b/filter_kw_val.py Fri Sep 15 09:03:45 2017 -0400
# The non-Python parts of that changeset are preserved as comments at the
# end of this file.
import argparse
import operator
import re

# Plain integers / decimals (optional sign, optional exponent for floats).
_INT_RE = re.compile(r"^[+-]?[0-9]+$")
_FLOAT_RE = re.compile(
    r"^[+-]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][+-]?[0-9]+)?$")

_COLUMN_ERROR = ("Please specify the column where you would like to apply "
                 "the filter with valid format")

# A row is KEPT when "<cell value> <opt> <filter value>" holds; any option
# not listed here falls back to equality.
_COMPARATORS = {
    "<": operator.lt,
    "<=": operator.le,
    ">": operator.gt,
    ">=": operator.ge,
}


def options(argv=None):
    """
    Parse arguments and run the filters:
        -i, --input: Input file (text, tabular), given as "<path>,<header>"
                     where <header> is "true" when the first line is a header
        -m, --match: For keyword filter, if we filter for exact
        --kw: keywords to filter out (keywords, column, exact-match flag)
        --kw_file: file containing keywords to filter out
                   (file, column, exact-match flag)
        --value: value to filter (value, column, comparison operator)
        -o, --output: output filename, default is output.txt
        --trash_file: extra output file containing removed lines

    argv: optional list of arguments; defaults to the process arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="Input file", required=True)
    parser.add_argument("-m", "--match", help="Exact match")
    parser.add_argument("--kw", nargs="+", action="append", help="")
    parser.add_argument("--kw_file", nargs="+", action="append", help="")
    parser.add_argument("--value", nargs="+", action="append", help="")
    parser.add_argument("-o", "--output", default="output.txt")
    parser.add_argument("--trash_file", default="trash_MQfilter.txt")

    args = parser.parse_args(argv)

    filters(args)


def isnumber(format, n):
    """
    Check if the string n is an integer (format == "int") or a decimal
    number (format == "float"). Any other format returns False.
    """
    if format == "int":
        pattern = _INT_RE
    elif format == "float":
        pattern = _FLOAT_RE
    else:
        return False
    return bool(pattern.match(n))


def _column_index(ncol):
    """Convert a 1-based column spec such as "c3" into a 0-based index."""
    digits = ncol.replace("c", "") if ncol else ""
    # Column 0 or a negative column would silently index from the row's end.
    if not isnumber("int", digits) or int(digits) < 1:
        raise ValueError(_COLUMN_ERROR)
    return int(digits) - 1


def _first_three(values, option):
    """Return the three values every filter option needs, or raise."""
    if len(values) < 3:
        raise ValueError("%s expects 3 values, got %d" % (option, len(values)))
    return values[0], values[1], values[2]


def _split_header(rows, header):
    """Return (header line or None, copy of the content rows)."""
    if header == "true" and rows:
        return rows[0], rows[1:]
    return None, rows[:]


def _start_removed(previous, header_line):
    """Continue the removed-lines list of earlier filters, or start one."""
    if previous:
        return list(previous)
    return [header_line] if header_line is not None else []


def _split_line(line):
    """Split a row into its tab-separated cells and its line terminator."""
    body = line.rstrip("\r\n")
    return body.split("\t"), line[len(body):]


def _parse_float(cell):
    """Return the cell as a float, or None when it is not a plain number."""
    text = cell.replace('"', "").strip()
    return float(text) if _FLOAT_RE.match(text) else None


def filters(args):
    """
    Extract filter arguments, apply every requested filter in turn
    (--kw, then --kw_file, then --value) and write the kept lines to
    args.output and the removed lines to args.trash_file.
    """
    # Input is "<path>,<header flag>"; split on the LAST comma so that a
    # path containing commas still works, and tolerate a missing flag.
    filename, sep, header = args.input.rpartition(",")
    if not sep:
        filename, header = args.input, "false"

    kept = readMQ(filename)
    removed = None

    # Keywords given on the command line
    for kw in args.kw or []:
        kws, ncol, match = _first_three(kw, "--kw")
        kept, removed = filter_keyword(kept, header, removed, kws, ncol, match)

    # Keywords read from files
    for kf in args.kw_file or []:
        kw_path, ncol, match = _first_three(kf, "--kw_file")
        kept, removed = filter_keyword(kept, header, removed,
                                       readOption(kw_path), ncol, match)

    # Numeric filters
    for v in args.value or []:
        value, ncol, opt = _first_three(v, "--value")
        if not isnumber("float", value):
            raise ValueError("Please enter a number in filter by value")
        kept, removed = filter_value(kept, header, removed, value, ncol, opt)

    # Write results to output
    with open(args.output, "w") as output:
        output.write("".join(kept))

    # Write deleted lines to trash_file (empty when no filter was requested)
    with open(args.trash_file, "w") as trash:
        trash.write("".join(removed or []))


def readOption(filename):
    """
    Read file containing keywords to filter out (one per line) and return
    them as a single ":"-separated string; blank lines are skipped.
    """
    with open(filename, "r") as handle:
        entries = [entry.strip() for entry in handle.read().splitlines()]
    return ":".join(entry for entry in entries if entry)


def readMQ(MQfilename):
    """
    Read input file and return list of file's lines, without the lines
    that are empty or contain only whitespace.
    """
    with open(MQfilename, "r") as handle:
        rows = [row for row in handle if row.strip()]
    # A last line without terminator would be glued to the next line once
    # rows are regrouped into the output / trash files.
    if rows and not rows[-1].endswith("\n"):
        rows[-1] += "\n"
    return rows


def filter_keyword(MQfile, header, filtered_lines, kws, ncol, match):
    """
    Filter keywords.

    MQfile: list of lines (the header line first when header == "true")
    header: "true" when the first line is a header
    filtered_lines: lines removed by previous filters, or None
    kws: ":"-separated keywords, compared case-insensitively
    ncol: 1-based column to inspect, e.g. "c1"
    match: "false" for substring matching, anything else for exact match

    The inspected cell may hold several ";"-separated IDs; a line is removed
    when any of them matches, and the cell is reduced to its first ID in
    both outputs. Lines too short to have the column are kept unchanged.

    Returns (kept lines, removed lines); the input lists are not modified.
    Raises ValueError when ncol is not a valid column.
    """
    id_index = _column_index(ncol)

    # Extract list of keywords to filter out
    keywords = [kw.strip() for kw in kws.upper().split(":") if kw.strip()]
    keyword_set = set(keywords)
    exact = match != "false"

    header_line, content = _split_header(MQfile, header)
    removed = _start_removed(filtered_lines, header_line)
    kept = [header_line] if header_line is not None else []

    for line in content:
        cells, ending = _split_line(line)
        if id_index >= len(cells):
            kept.append(line)
            continue

        ids = cells[id_index].replace('"', "").split(";")
        cells[id_index] = ids[0]  # Take only first ID
        one_id_line = "\t".join(cells) + ending

        candidates = [pid.strip().upper() for pid in ids]
        if exact:
            hit = any(pid in keyword_set for pid in candidates)
        else:
            hit = any(kw in pid for pid in candidates for kw in keywords)
        (removed if hit else kept).append(one_id_line)

    return kept, removed


def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt):
    """
    Filter values.

    MQfile: list of lines (the header line first when header == "true")
    header: "true" when the first line is a header
    filtered_prots: lines removed by previous filters, or None
    filter_value: number (or numeric string) to compare the cells with
    ncol: 1-based column to inspect, e.g. "c2"
    opt: "<", "<=", ">" or ">="; anything else means equality

    A line is kept when "<cell> <opt> <filter_value>" holds and removed
    otherwise. Lines whose cell is not a number (or is missing) are kept.

    Returns (kept lines, removed lines); the input lists are not modified.
    Raises ValueError when ncol or filter_value is not valid.
    """
    index = _column_index(ncol)
    threshold = float(filter_value)
    satisfies = _COMPARATORS.get(opt, operator.eq)

    header_line, content = _split_header(MQfile, header)
    removed = _start_removed(filtered_prots, header_line)
    kept = [header_line] if header_line is not None else []

    for prot in content:
        cells, _ = _split_line(prot)
        value = _parse_float(cells[index]) if index < len(cells) else None
        if value is None or satisfies(value, threshold):
            kept.append(prot)
        else:
            removed.append(prot)

    return kept, removed


if __name__ == "__main__":
    options()

# ---------------------------------------------------------------------------
# Remainder of the original changeset, preserved for reference. The XML
# markup of filter_kw_val.xml was already missing from the copy this file was
# reconstructed from; only its text fragments are available.
#
# diff -r 000000000000 -r 8e3bb3153dc4 filter_kw_val.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/filter_kw_val.xml Fri Sep 15 09:03:45 2017 -0400
# @@ -0,0 +1,202 @@
#
#   Filter a file by keywords or values
#
#   [...] "
#   #else if $val.v.val == "Equal or higher"
#       $val.v.equal_higher "$val.v.ncol" ">="
#   #else if $val.v.val == "Lower"
#       $val.v.lower "$val.v.ncol" "<"
#   #else
#       $val.v.equal_lower "$val.v.ncol" "<="
#   #end if
#   #end if
#   #end for
#   ]]>
#
#   [...] ", ">=", "<" and "<=", then enter the value to filter and specify
#   the column to apply that option.
#
#   **Output**
#
#   The tool will produce 2 output files.
#
#   * A text file containing the resulting filtered input file.
#
#   * A text file containing the rows removed from the input file.
#
#   -----
#
#   .. class:: infomark
#
#   **Authors**
#
#   T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS,
#   Grenoble-Alpes University, BIG Institute, FR
#   Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA,
#   Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform
#
#   This work has been partially funded through the French National Agency
#   for Research (ANR) IFB project.
#
#   Contact support@proteore.org for any questions or concerns about the
#   Galaxy implementation of this tool.
#
#   ]]>
#
# diff -r 000000000000 -r 8e3bb3153dc4 test-data/UnipIDs.txt
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/test-data/UnipIDs.txt Fri Sep 15 09:03:45 2017 -0400
# @@ -0,0 +1,25 @@
# +P04637 1 A0
# +P08246 2 B0
# +P63244 1.5 C1
# +P10275 3 A2
# +P00533 2 A3
# +Q14524 3.5 D1
# +P05067 1 B3
# +P35555 0 C0
# +P35222 0.9 D2
# +O95273 1.1 A4
# +P00451 2 B2
# +P38398 5 B4
# +Q05086 0 C2
# +Q12802 3 D5
# +P68871 1.5 B4
# +P04585 2.5 D3
# +Q96EB6 0 C3
# +Q9NYL2 1 B1
# +P31749 3 A1
# +P01137 5 B6
# +Q5S007 8 D4
# +Q08379 2 C4
# +P02649 0 B5
# +P35498 1 C5
# +P12931 3 A5
#
# diff -r 000000000000 -r 8e3bb3153dc4 test-data/filter_keywords_values_output.txt
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/test-data/filter_keywords_values_output.txt Fri Sep 15 09:03:45 2017 -0400
# @@ -0,0 +1,14 @@
# +P08246 2 B0
# +P63244 1.5 C1
# +Q14524 3.5 D1
# +P05067 1 B3
# +P00451 2 B2
# +P38398 5 B4
# +Q12802 3 D5
# +P68871 1.5 B4
# +P04585 2.5 D3
# +Q9NYL2 1 B1
# +P01137 5 B6
# +Q5S007 8 D4
# +Q08379 2 C4
# +P35498 1 C5
#
# diff -r 000000000000 -r 8e3bb3153dc4 test-data/filter_keywords_values_removed.txt
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/test-data/filter_keywords_values_removed.txt Fri Sep 15 09:03:45 2017 -0400
# @@ -0,0 +1,11 @@
# +P04637 1 A0
# +P10275 3 A2
# +P00533 2 A3
# +O95273 1.1 A4
# +P31749 3 A1
# +P12931 3 A5
# +P35555 0 C0
# +P35222 0.9 D2
# +Q05086 0 C2
# +Q96EB6 0 C3
# +P02649 0 B5