Mercurial > repos > davidvanzessen > imgt_locus_split
changeset 2:4bb8f6523130 draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 05 Mar 2019 04:41:47 -0500 |
parents | 418b7dbc8947 |
children | 9aa036caa8e9 |
files | imgt_locus_split.py imgt_locus_split.xml |
diffstat | 2 files changed, 28 insertions(+), 98 deletions(-) [+] |
line wrap: on
line diff
--- a/imgt_locus_split.py Mon Jul 17 08:54:02 2017 -0400 +++ b/imgt_locus_split.py Tue Mar 05 04:41:47 2019 -0500 @@ -14,18 +14,17 @@ def sniff_imgt_type(input_file): - m = magic.Magic() - file_type = m.from_file(input_file) + file_type = magic.from_file(input_file) logging.debug("File type of {0} is {1}".format(input_file, file_type)) - return file_type.split(" ")[0] + return file_type def unpack_imgt_zip(input_file, output_dir): imgt_type = sniff_imgt_type(input_file) - if imgt_type == "Zip": + if imgt_type.startswith("Zip"): with zipfile.ZipFile(input_file) as inf: inf.extractall(output_dir) - elif imgt_type == "XZ": + elif imgt_type.startswith("XZ"): with tarfile.open(input_file) as inf: inf.extractall(output_dir) else: @@ -44,8 +43,8 @@ shutil.rmtree(check) -def filter_imgt_file(old_file, new_file, column, fltr): - logging.debug("Filtering {0} with {1}".format(old_file, fltr)) +def filter_tabular_file(old_file, new_file, column, regex): + logging.debug("Filtering {0} with {1}".format(old_file, regex.pattern)) first = True total = 0 remain = 0 @@ -59,7 +58,7 @@ nf.write(line) continue total += 1 - if len(splt) > column_index and splt[column_index].find(fltr) != -1: + if len(splt) >= column_index and regex.search(splt[column_index]): remain += 1 nf.write(line) return total, remain @@ -69,21 +68,22 @@ return all(l[0] == x for x in l[1:]) -def filter_imgt_dir(imgt_dir, locus): - logging.info("Working on {0}".format(locus)) +def filter_imgt_dir(imgt_dir, loci): + logging.info("Filtering {0} with {1}".format(imgt_dir, loci)) imgt_files = [f for f in os.listdir(imgt_dir) if imgt_file_regex.match(f)] tmp_file = os.path.join(imgt_dir, "tmp.txt") totals = [] remains = [] + loci_regex = re.compile("|".join(loci)) for imgt_file in imgt_files: imgt_file = os.path.join(imgt_dir, imgt_file) - total, remain = filter_imgt_file(imgt_file, tmp_file, "V-GENE and allele", locus) + total, remain = filter_tabular_file(imgt_file, tmp_file, "V-GENE and allele", loci_regex) totals.append(total) remains.append(remain) logging.debug("{0} rows, {1} after filtering".format(total, remain)) shutil.move(tmp_file, imgt_file) if not (all_same_in_list(totals) and all_same_in_list(remains)): - logging.warning("Not all files had the same number of sequences remaining for {0}".format(imgt_dir)) + logging.warning("Not all files had the same number of sequences remaining for {0}: {1}".format(imgt_dir, remains)) return totals[0], remains[0] @@ -99,16 +99,9 @@ def main(): parser = argparse.ArgumentParser() - parser.add_argument("--input", help="The input IMGT file", required=True) - parser.add_argument("--output-ig", help="The output file for new IMGT ZIP with just IG sequences", default="None") - parser.add_argument("--output-igh", help="The output file for new IMGT ZIP with just IGH sequences", default="None") - parser.add_argument("--output-igk", help="The output file for new IMGT ZIP with just IGK sequences", default="None") - parser.add_argument("--output-igl", help="The output file for new IMGT ZIP with just IGL sequences", default="None") - parser.add_argument("--output-tr", help="The output file for new IMGT ZIP with just TR sequences", default="None") - parser.add_argument("--output-tra", help="The output file for new IMGT ZIP with just TRA sequences", default="None") - parser.add_argument("--output-trb", help="The output file for new IMGT ZIP with just TRB sequences", default="None") - parser.add_argument("--output-trd", help="The output file for new IMGT ZIP with just TRD sequences", default="None") - parser.add_argument("--output-trg", help="The output file for new IMGT ZIP with just TRG sequences", default="None") + parser.add_argument("-i", "--input", help="The input IMGT file", required=True) + parser.add_argument("-l", "--loci", help="The Loci to filter on", required=True) + parser.add_argument("-o", "--output", help="The output file for the new IMGT zip with just the filtered sequences", required=True) logging.basicConfig(filename="./log.html", level=logging.DEBUG, format="%(asctime)s: %(message)s <br />", datefmt='%Y/%m/%d %H:%M:%S') @@ -117,57 +110,27 @@ args = parser.parse_args() input_file = args.input - output_ig = args.output_ig - output_igh = args.output_igh - output_igk = args.output_igk - output_igl = args.output_igl - output_tr = args.output_tr - output_tra = args.output_tra - output_trb = args.output_trb - output_trd = args.output_trd - output_trg = args.output_trg - - loci = { - "IG": output_ig, - "IGH": output_igh, - "IGK": output_igk, - "IGL": output_igl, - "TR": output_tr, - "TRA": output_tra, - "TRB": output_trb, - "TRD": output_trd, - "TRG": output_trg - } - - loci_to_filter = {} + loci = args.loci.split(",") + output_file = args.output logging.debug("All Parameters:") logging.debug("Input: {0}".format(input_file)) - for locus, path in loci.items(): - logging.debug("{0}: {1}".format(locus, path)) - if path != "None" and os.path.isdir(os.path.split(path)[0]): - loci_to_filter[locus] = path + logging.debug("Loci: {0}".format(loci)) + logging.debug("Output: {0}".format(output_file)) - if len(loci_to_filter) == 0: + if len(loci) == 0: raise Exception("No locus selected, nothing to do") - logging.info("Parameters:") - for locus, path in loci_to_filter.items(): - logging.info("{0}: {1}".format(locus, path)) - work_dir = tempfile.mkdtemp() original_files_dir = os.path.join(work_dir, "original") os.mkdir(original_files_dir) unpack_imgt_zip(input_file, original_files_dir) - for locus, path in loci_to_filter.items(): - locus_dir = os.path.join(work_dir, locus) - shutil.copytree(original_files_dir, locus_dir) - total, remain = filter_imgt_dir(locus_dir, locus) - logging.info("{0}\t{1}\t{2}\t{3}".format(locus, path, total, remain)) + total, remain = filter_imgt_dir(original_files_dir, loci) + logging.info("{0}\t{1}".format(total, remain)) - make_new_xz_file(locus_dir, loci_to_filter[locus]) + make_new_xz_file(original_files_dir, output_file) if __name__ == "__main__":
--- a/imgt_locus_split.xml Mon Jul 17 08:54:02 2017 -0400 +++ b/imgt_locus_split.xml Tue Mar 05 04:41:47 2019 -0500 @@ -1,19 +1,12 @@ <tool id="imgt_locus_split" name="IMGT Locus Split" version="0.1"> <requirements> - <requirement type="package" version="0.4.13">python-magic</requirement> + <requirement type="package" version="0.4.15">python-magic</requirement> </requirements> <command detect_errors="exit_code"><![CDATA[ python3 $__tool_directory__/imgt_locus_split.py - --input $input - --output-ig ${output_ig} - --output-igh ${output_igh} - --output-igk ${output_igk} - --output-igl ${output_igl} - --output-tr ${output_tr} - --output-tra ${output_tra} - --output-trb ${output_trb} - --output-trd ${output_trd} - --output-trg ${output_trg} + --input $input + --loci $loci + --output $output ]]></command> <inputs> <param name="input" type="data" format="data" label="The IMGT zip file to be split"/> @@ -30,33 +23,7 @@ </param> </inputs> <outputs> - <data format="imgt_archive" name="output_ig" label="Filtered IMGT IG: ${input.name}"> - <filter>loci.__contains__('IG')</filter> - </data> - <data format="imgt_archive" name="output_igh" label="Filtered IMGT IGH: ${input.name}"> - <filter>loci.__contains__('IGH')</filter> - </data> - <data format="imgt_archive" name="output_igk" label="Filtered IMGT IGK: ${input.name}"> - <filter>loci.__contains__('IGK')</filter> - </data> - <data format="imgt_archive" name="output_igl" label="Filtered IMGT IGL: ${input.name}"> - <filter>loci.__contains__('IGL')</filter> - </data> - <data format="imgt_archive" name="output_tr" label="Filtered IMGT TR: ${input.name}"> - <filter>loci.__contains__('TR')</filter> - </data> - <data format="imgt_archive" name="output_tra" label="Filtered IMGT TRA: ${input.name}"> - <filter>loci.__contains__('TRA')</filter> - </data> - <data format="imgt_archive" name="output_trb" label="Filtered IMGT TRB: ${input.name}"> - <filter>loci.__contains__('TRB')</filter> - </data> - <data format="imgt_archive" name="output_trd" label="Filtered IMGT TRD: ${input.name}"> - <filter>loci.__contains__('TRD')</filter> - </data> - <data format="imgt_archive" name="output_trg" label="Filtered IMGT TRG: ${input.name}"> - <filter>loci.__contains__('TRG')</filter> - </data> + <data format="imgt_archive" name="output" label="${input.name} $loci" /> </outputs> <help><![CDATA[ Creates a new IMGT zip file for every checked locus with sequences from only that locus.