# HG changeset patch # User davidvanzessen # Date 1500296042 14400 # Node ID 418b7dbc8947e681aa383ebe51c84d91210b1868 # Parent b00c257f0a671629e2c7b94e403d6188c832105b Uploaded diff -r b00c257f0a67 -r 418b7dbc8947 imgt_locus_split.py --- a/imgt_locus_split.py Thu Jul 13 10:24:39 2017 -0400 +++ b/imgt_locus_split.py Mon Jul 17 08:54:02 2017 -0400 @@ -44,8 +44,8 @@ shutil.rmtree(check) -def filter_tabular_file(old_file, new_file, column, regex): - logging.debug("Filtering {0} with {1}".format(old_file, regex.pattern)) +def filter_imgt_file(old_file, new_file, column, fltr): + logging.debug("Filtering {0} with {1}".format(old_file, fltr)) first = True total = 0 remain = 0 @@ -59,7 +59,7 @@ nf.write(line) continue total += 1 - if len(splt) >= column_index and regex.search(splt[column_index]): + if len(splt) > column_index and splt[column_index].find(fltr) != -1: remain += 1 nf.write(line) return total, remain @@ -69,22 +69,21 @@ return all(l[0] == x for x in l[1:]) -def filter_imgt_dir(imgt_dir, loci): - logging.info("Filtering {0} with {1}".format(imgt_dir, loci)) +def filter_imgt_dir(imgt_dir, locus): + logging.info("Working on {0}".format(locus)) imgt_files = [f for f in os.listdir(imgt_dir) if imgt_file_regex.match(f)] tmp_file = os.path.join(imgt_dir, "tmp.txt") totals = [] remains = [] - loci_regex = re.compile("|".join(loci)) for imgt_file in imgt_files: imgt_file = os.path.join(imgt_dir, imgt_file) - total, remain = filter_tabular_file(imgt_file, tmp_file, "V-GENE and allele", loci_regex) + total, remain = filter_imgt_file(imgt_file, tmp_file, "V-GENE and allele", locus) totals.append(total) remains.append(remain) logging.debug("{0} rows, {1} after filtering".format(total, remain)) shutil.move(tmp_file, imgt_file) if not (all_same_in_list(totals) and all_same_in_list(remains)): - logging.warning("Not all files had the same number of sequences remaining for {0}: {1}".format(imgt_dir, remains)) + logging.warning("Not all files had the same number of sequences remaining for {0}".format(imgt_dir)) return totals[0], remains[0] @@ -100,9 +99,16 @@ def main(): parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input", help="The input IMGT file", required=True) - parser.add_argument("-l", "--loci", help="The Loci to filter on", required=True) - parser.add_argument("-o", "--output", help="The output file for the new IMGT zip with just the filtered sequences", required=True) + parser.add_argument("--input", help="The input IMGT file", required=True) + parser.add_argument("--output-ig", help="The output file for new IMGT ZIP with just IG sequences", default="None") + parser.add_argument("--output-igh", help="The output file for new IMGT ZIP with just IGH sequences", default="None") + parser.add_argument("--output-igk", help="The output file for new IMGT ZIP with just IGK sequences", default="None") + parser.add_argument("--output-igl", help="The output file for new IMGT ZIP with just IGL sequences", default="None") + parser.add_argument("--output-tr", help="The output file for new IMGT ZIP with just TR sequences", default="None") + parser.add_argument("--output-tra", help="The output file for new IMGT ZIP with just TRA sequences", default="None") + parser.add_argument("--output-trb", help="The output file for new IMGT ZIP with just TRB sequences", default="None") + parser.add_argument("--output-trd", help="The output file for new IMGT ZIP with just TRD sequences", default="None") + parser.add_argument("--output-trg", help="The output file for new IMGT ZIP with just TRG sequences", default="None") logging.basicConfig(filename="./log.html", level=logging.DEBUG, format="%(asctime)s: %(message)s
", datefmt='%Y/%m/%d %H:%M:%S') @@ -111,27 +117,57 @@ args = parser.parse_args() input_file = args.input - loci = args.loci.split(",") - output_file = args.output + output_ig = args.output_ig + output_igh = args.output_igh + output_igk = args.output_igk + output_igl = args.output_igl + output_tr = args.output_tr + output_tra = args.output_tra + output_trb = args.output_trb + output_trd = args.output_trd + output_trg = args.output_trg + + loci = { + "IG": output_ig, + "IGH": output_igh, + "IGK": output_igk, + "IGL": output_igl, + "TR": output_tr, + "TRA": output_tra, + "TRB": output_trb, + "TRD": output_trd, + "TRG": output_trg + } + + loci_to_filter = {} logging.debug("All Parameters:") logging.debug("Input: {0}".format(input_file)) - logging.debug("Loci: {0}".format(loci)) - logging.debug("Output: {0}".format(output_file)) + for locus, path in loci.items(): + logging.debug("{0}: {1}".format(locus, path)) + if path != "None" and os.path.isdir(os.path.split(path)[0]): + loci_to_filter[locus] = path - if len(loci) == 0: + if len(loci_to_filter) == 0: raise Exception("No locus selected, nothing to do") + logging.info("Parameters:") + for locus, path in loci_to_filter.items(): + logging.info("{0}: {1}".format(locus, path)) + work_dir = tempfile.mkdtemp() original_files_dir = os.path.join(work_dir, "original") os.mkdir(original_files_dir) unpack_imgt_zip(input_file, original_files_dir) - total, remain = filter_imgt_dir(original_files_dir, loci) - logging.info("{0}\t{1}".format(total, remain)) + for locus, path in loci_to_filter.items(): + locus_dir = os.path.join(work_dir, locus) + shutil.copytree(original_files_dir, locus_dir) + total, remain = filter_imgt_dir(locus_dir, locus) + logging.info("{0}\t{1}\t{2}\t{3}".format(locus, path, total, remain)) - make_new_xz_file(original_files_dir, output_file) + make_new_xz_file(locus_dir, loci_to_filter[locus]) if __name__ == "__main__": diff -r b00c257f0a67 -r 418b7dbc8947 imgt_locus_split.xml --- a/imgt_locus_split.xml Thu Jul 13 10:24:39 2017 -0400 +++ b/imgt_locus_split.xml Mon Jul 17 08:54:02 2017 -0400 @@ -4,9 +4,16 @@ @@ -23,7 +30,33 @@ - + + loci.__contains__('IG') + + + loci.__contains__('IGH') + + + loci.__contains__('IGK') + + + loci.__contains__('IGL') + + + loci.__contains__('TR') + + + loci.__contains__('TRA') + + + loci.__contains__('TRB') + + + loci.__contains__('TRD') + + + loci.__contains__('TRG') +