Mercurial > repos > lnguyen > filter_keywords_values
view filter_kw_val.py @ 0:8e3bb3153dc4 draft default tip
planemo upload
author | lnguyen |
---|---|
date | Fri, 15 Sep 2017 09:03:45 -0400 |
parents | |
children |
line wrap: on
line source
"""Filter the lines of a tab-separated text file by keywords and/or values.

Lines that are filtered out are collected and written to a separate
"trash" file, so nothing from the input is silently lost.
"""
import argparse
import operator
import re

# Optional minus sign followed by digits only.
_INT_RE = re.compile(r"^-?[0-9]+\Z")
# Optional minus sign, digits with an optional fractional part (or a bare
# fractional part), and an optional exponent.
_FLOAT_RE = re.compile(r"^-?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?\Z")

_COLUMN_ERROR = "Please specify the column where you would like to apply the filter with valid format"

# Comparison operators selectable in filter_value(); any other option
# falls back to equality.
_COMPARATORS = {
    "<": operator.lt,
    "<=": operator.le,
    ">": operator.gt,
    ">=": operator.ge,
}


def options():
    """
    Parse command-line arguments and run the filters.

    Arguments:
        -i, --input: input file (text, tabular), given as
            "<path>,<true|false>" where the flag tells whether the first
            line is a header
        -m, --match: for keyword filter, if we filter for exact match
        --kw: keywords to filter out (keywords, column, exact-match flag)
        --kw_file: file containing keywords to filter out
            (path, column, exact-match flag)
        --value: value to filter (value, column, comparison operator)
        -o, --output: output filename, default is output.txt
        --trash_file: extra output file containing removed lines
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="Input file", required=True)
    parser.add_argument("-m", "--match", help="Exact match")
    parser.add_argument("--kw", nargs="+", action="append", help="")
    # filters() reads args.kw_file, so the option has to be registered.
    parser.add_argument("--kw_file", nargs="+", action="append", help="")
    parser.add_argument("--value", nargs="+", action="append", help="")
    parser.add_argument("-o", "--output", default="output.txt")
    parser.add_argument("--trash_file", default="trash_MQfilter.txt")

    args = parser.parse_args()
    filters(args)


def isnumber(format, n):
    """
    Check whether the string ``n`` is written as an integer or a float.

    Args:
        format: "int" or "float"; any other value makes the check fail.
        n: the text to test.

    Returns:
        True if ``n`` matches the requested number format, False otherwise.
    """
    if format == "int":
        pattern = _INT_RE
    elif format == "float":
        pattern = _FLOAT_RE
    else:
        return False
    return pattern.match(n) is not None


def _column_index(ncol):
    """Convert a column spec such as "c3" or "3" to a zero-based index."""
    number = ncol.replace("c", "") if ncol else ""
    # Column numbers are 1-based: reject 0 and negatives, which would
    # otherwise turn into negative (wrap-around) list indexes.
    if not isnumber("int", number) or int(number) < 1:
        raise ValueError(_COLUMN_ERROR)
    return int(number) - 1


def _split_header(lines, header):
    """Return (header line or None, content lines) according to the flag."""
    if header == "true" and lines:
        return lines[0], list(lines[1:])
    return None, list(lines)


def _start_removed(previous, header_line):
    """Return the list of removed lines to extend, seeded with the header."""
    if previous:
        # Lines already removed by an earlier filter (header included).
        return list(previous)
    return [] if header_line is None else [header_line]


def _split_fields(line, index):
    """Split a line into tab-separated fields plus its line terminator."""
    body = line.rstrip("\r\n")
    fields = body.split("\t")
    if index >= len(fields):
        raise IndexError("Column %d not found in line: %r" % (index + 1, line))
    return fields, line[len(body):]


def _write_lines(path, lines):
    """Write the given lines to ``path``, replacing any existing file."""
    with open(path, "w") as handle:
        handle.write("".join(lines))


def filters(args):
    """
    Apply every requested filter and write the two result files.

    Args:
        args: parsed arguments with the attributes ``input``, ``kw``,
            ``value``, ``output`` and ``trash_file`` (``kw_file`` is
            optional).

    Raises:
        ValueError: if a value filter is not given a number, or a column
            spec is invalid.
    """
    # The input argument is "<path>,<header flag>".
    parts = args.input.split(",")
    MQfilename = parts[0]
    header = parts[1] if len(parts) > 1 else "false"

    results = (readMQ(MQfilename), None)

    # Keyword filters given directly on the command line
    for k in args.kw or []:
        results = filter_keyword(results[0], header, results[1], k[0], k[1], k[2])

    # Keyword filters whose keywords are listed in a file
    for kf in getattr(args, "kw_file", None) or []:
        ids = readOption(kf[0])
        results = filter_keyword(results[0], header, results[1], ids, kf[1], kf[2])

    # Value filters
    for v in args.value or []:
        if not isnumber("float", v[0]):
            raise ValueError("Please enter a number in filter by value")
        results = filter_value(results[0], header, results[1], v[0], v[1], v[2])

    # Kept lines go to the output file, removed lines to the trash file.
    _write_lines(args.output, results[0])
    # No filter at all leaves the removed list as None.
    _write_lines(args.trash_file, results[1] or [])


def readOption(filename):
    """
    Read a file listing keywords to filter out, one per line.

    Returns:
        The non-blank keywords joined with ":" (the format expected by
        filter_keyword).
    """
    with open(filename, "r") as handle:
        entries = [entry.strip() for entry in handle.read().splitlines()]
    return ":".join(entry for entry in entries if entry)


def readMQ(MQfilename):
    """
    Read the input file and return the list of its non-blank lines.

    Lines that are empty or contain only whitespace are dropped; kept
    lines retain their line terminator.
    """
    with open(MQfilename, "r") as handle:
        return [line for line in handle if line.strip()]


def filter_keyword(MQfile, header, filtered_lines, kws, ncol, match):
    """
    Filter out the lines whose identifier column contains a keyword.

    The identifier cell may hold several ";"-separated IDs. Every line,
    kept or removed, is rewritten so that this cell holds only its first
    ID, with double quotes removed.

    Args:
        MQfile: list of lines (header first when ``header`` is "true").
        header: "true" if the first line is a header.
        filtered_lines: lines removed by previous filters, or None.
        kws: keywords separated by ":"; matching ignores case.
        ncol: 1-based column number, optionally prefixed with "c".
        match: "false" for substring matching, anything else for exact
            matching of a whole ID.

    Returns:
        (kept lines, removed lines) as two new lists; the inputs are not
        modified.

    Raises:
        ValueError: if ``ncol`` is not a valid column number.
        IndexError: if a line has fewer columns than ``ncol``.
    """
    id_index = _column_index(ncol)

    # Keywords are compared upper-cased; blank entries are ignored.
    keywords = [kw for kw in kws.upper().split(":") if kw.strip()]
    keyword_set = set(keywords)
    exact = match != "false"

    header_line, content = _split_header(MQfile, header)
    kept = [] if header_line is None else [header_line]
    removed = _start_removed(filtered_lines, header_line)

    for line in content:
        # The terminator is split off so that an ID in the last column is
        # matched without a trailing newline, and then put back.
        fields, eol = _split_fields(line, id_index)
        ids = fields[id_index].replace('"', "").split(";")
        # Only the target column is rewritten, never look-alike text
        # elsewhere in the line.
        fields[id_index] = ids[0]
        one_id_line = "\t".join(fields) + eol

        if exact:
            hit = any(pid.upper() in keyword_set for pid in ids)
        else:
            hit = any(kw in pid.upper() for pid in ids for kw in keywords)

        if hit:
            removed.append(one_id_line)
        else:
            kept.append(one_id_line)

    return kept, removed


def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt):
    """
    Filter out the lines whose value does not satisfy a comparison.

    A line is kept when ``<cell> <opt> <filter_value>`` holds. Lines whose
    cell is not written as a number are always kept.

    Args:
        MQfile: list of lines (header first when ``header`` is "true").
        header: "true" if the first line is a header.
        filtered_prots: lines removed by previous filters, or None.
        filter_value: the number to compare against (str or number).
        ncol: 1-based column number, optionally prefixed with "c".
        opt: "<", "<=", ">" or ">="; anything else means equality.

    Returns:
        (kept lines, removed lines) as two new lists; the inputs are not
        modified.

    Raises:
        ValueError: if ``ncol`` is not a valid column number or
            ``filter_value`` is not a number.
        IndexError: if a line has fewer columns than ``ncol``.
    """
    index = _column_index(ncol)
    threshold = float(filter_value)
    keep_if = _COMPARATORS.get(opt, operator.eq)

    header_line, content = _split_header(MQfile, header)
    kept = [] if header_line is None else [header_line]
    removed = _start_removed(filtered_prots, header_line)

    for prot in content:
        fields, _ = _split_fields(prot, index)
        cell = fields[index].replace('"', "").strip()
        if isnumber("float", cell) and not keep_if(float(cell), threshold):
            removed.append(prot)
        else:
            kept.append(prot)

    return kept, removed


if __name__ == "__main__":
    options()