Mercurial > repos > davidvanzessen > imgt_locus_split
changeset 0:b00c257f0a67 draft
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 13 Jul 2017 10:24:39 -0400 |
parents | |
children | 418b7dbc8947 |
files | imgt_locus_split.py imgt_locus_split.xml |
diffstat | 2 files changed, 172 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
"""Split an IMGT archive by locus (imgt_locus_split.py).

Unpacks an IMGT zip or xz-compressed tar archive, filters every numbered IMGT
table down to the rows whose "V-GENE and allele" column matches one of the
requested loci, and writes the filtered files to a new xz-compressed tar
archive.
"""
import argparse
import logging
import os
import re
import shutil
import sys
import tarfile
import tempfile
import zipfile

# File names made of digits, an underscore and then any character except "P".
# NOTE(review): presumably the "P" exclusion skips the IMGT parameters file,
# which is not a tab-separated table - confirm against a real archive.
imgt_file_regex = re.compile(r"^\d+_[^P]")


def sniff_imgt_type(input_file):
    """Return the first word of libmagic's description of *input_file*.

    unpack_imgt_zip dispatches on this value ("Zip" or "XZ").
    """
    # Imported lazily so the pure-Python filtering helpers in this module stay
    # importable on systems where python-magic/libmagic is not installed.
    import magic

    file_type = magic.Magic().from_file(input_file)
    logging.debug("File type of %s is %s", input_file, file_type)
    return file_type.split(" ")[0]


def _check_tar_members(archive, output_dir):
    """Raise IOError if extracting *archive* could write outside *output_dir*.

    The archive is user supplied, so absolute names, ".." components and
    links (whose targets may point anywhere) are rejected before extraction.
    """
    root = os.path.realpath(output_dir)
    for member in archive.getmembers():
        if member.issym() or member.islnk():
            raise IOError("Refusing to extract link from archive: {0}".format(member.name))
        target = os.path.realpath(os.path.join(root, member.name))
        if target != root and not target.startswith(root + os.sep):
            raise IOError("Refusing to extract outside of {0}: {1}".format(output_dir, member.name))


def unpack_imgt_zip(input_file, output_dir):
    """Extract the IMGT archive *input_file* into *output_dir*.

    Zip archives and xz-compressed tar archives are supported. When the
    archive contains a single top-level directory, its contents are moved
    directly into *output_dir* and the directory is removed.

    Raises:
        IOError: if the file type is unsupported or a tar member is unsafe.
    """
    imgt_type = sniff_imgt_type(input_file)
    if imgt_type == "Zip":
        # zipfile sanitises member names itself (see its documentation).
        with zipfile.ZipFile(input_file) as inf:
            inf.extractall(output_dir)
    elif imgt_type == "XZ":
        with tarfile.open(input_file) as inf:
            _check_tar_members(inf, output_dir)
            inf.extractall(output_dir)
    else:
        raise IOError("Unsupported file type: {0}".format(imgt_type))
    logging.debug("Extracted %s to %s", input_file, output_dir)

    entries = os.listdir(output_dir)
    if len(entries) != 1:
        return
    wrapper_dir = os.path.join(output_dir, entries[0])
    if not os.path.isdir(wrapper_dir):
        return
    logging.info("%s is an older IMGT zip, removing extra dir", input_file)
    for name in os.listdir(wrapper_dir):
        shutil.move(os.path.join(wrapper_dir, name), os.path.join(output_dir, name))
    shutil.rmtree(wrapper_dir)


def filter_tabular_file(old_file, new_file, column, regex):
    """Copy the header and the matching rows of a tab-separated file.

    Args:
        old_file: path of the tab-separated file to read.
        new_file: path of the filtered file to write.
        column: header name of the column to test.
        regex: compiled pattern; a row is kept when it is found in the cell.

    Returns:
        (total, remain): number of data rows read and number of rows kept.

    Raises:
        ValueError: if *column* is not present in the header line.
    """
    logging.debug("Filtering %s with %s", old_file, regex.pattern)
    total = 0
    remain = 0
    with open(old_file, 'r') as of, open(new_file, 'w') as nf:
        header = next(of, None)
        if header is None:
            # Empty input: nothing to copy.
            return total, remain
        # Strip the line terminator first, otherwise the last header cell
        # carries a trailing newline and can never equal *column*.
        header_cells = header.rstrip("\r\n").split("\t")
        try:
            column_index = header_cells.index(column)
        except ValueError:
            raise ValueError("Column {0!r} not found in header of {1}".format(column, old_file))
        nf.write(header)
        for line in of:
            total += 1
            cells = line.rstrip("\r\n").split("\t")
            # Strictly greater: a row with exactly column_index cells has no
            # cell at that index. Such short rows are counted but dropped.
            if len(cells) > column_index and regex.search(cells[column_index]):
                remain += 1
                nf.write(line)
    return total, remain


def all_same_in_list(l):
    """Return True when every element of *l* equals the first (or *l* is empty)."""
    return all(l[0] == x for x in l[1:])


def filter_imgt_dir(imgt_dir, loci):
    """Filter every numbered IMGT file in *imgt_dir* in place.

    Args:
        imgt_dir: directory holding the extracted IMGT files.
        loci: list of patterns; they are joined with "|" into one regex that
            is searched for in the "V-GENE and allele" column.

    Returns:
        (total, remain) as reported for the first filtered file.

    Raises:
        IOError: if *imgt_dir* contains no file matching imgt_file_regex.
    """
    logging.info("Filtering %s with %s", imgt_dir, loci)
    imgt_files = [f for f in os.listdir(imgt_dir) if imgt_file_regex.match(f)]
    if not imgt_files:
        # Fail with a clear message instead of an IndexError further down.
        raise IOError("No IMGT files found in {0}".format(imgt_dir))
    tmp_file = os.path.join(imgt_dir, "tmp.txt")
    loci_regex = re.compile("|".join(loci))
    totals = []
    remains = []
    for name in imgt_files:
        path = os.path.join(imgt_dir, name)
        total, remain = filter_tabular_file(path, tmp_file, "V-GENE and allele", loci_regex)
        totals.append(total)
        remains.append(remain)
        logging.debug("%s rows, %s after filtering", total, remain)
        # Replace the original file with its filtered version.
        shutil.move(tmp_file, path)
    if not (all_same_in_list(totals) and all_same_in_list(remains)):
        logging.warning("Not all files had the same number of sequences remaining for %s: %s", imgt_dir, remains)
    return totals[0], remains[0]


def make_new_xz_file(input_dir, output_file):
    """Pack every entry of *input_dir* into the xz-compressed tar *output_file*.

    Entries are stored under their bare names, in sorted order so the member
    order does not depend on the file system's directory listing order.
    """
    logging.info("Creating new IMGT zip for %s at %s", input_dir, output_file)
    # List before opening the archive so the output file is never picked up.
    names = sorted(os.listdir(input_dir))
    with tarfile.open(output_file, 'w:xz') as out:
        for name in names:
            logging.debug("Writing %s to new IMGT zip", name)
            out.add(os.path.join(input_dir, name), arcname=name)


def main():
    """Command-line entry point: unpack, filter by locus and repack."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="The input IMGT file", required=True)
    parser.add_argument("-l", "--loci", help="The Loci to filter on", required=True)
    parser.add_argument("-o", "--output", help="The output file for the new IMGT zip with just the filtered sequences", required=True)

    logging.basicConfig(filename="./log.html", level=logging.DEBUG, format="%(asctime)s: %(message)s <br />",
                        datefmt='%Y/%m/%d %H:%M:%S')
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
    logging.info("Started IMGT locus split")

    args = parser.parse_args()
    input_file = args.input
    # Drop empty entries: "".split(",") yields [""], and an empty alternative
    # in the joined regex would match every row.
    loci = [locus.strip() for locus in args.loci.split(",") if locus.strip()]
    output_file = args.output

    logging.debug("All Parameters:")
    logging.debug("Input: %s", input_file)
    logging.debug("Loci: %s", loci)
    logging.debug("Output: %s", output_file)

    if not loci:
        raise ValueError("No locus selected, nothing to do")

    work_dir = tempfile.mkdtemp()
    try:
        original_files_dir = os.path.join(work_dir, "original")
        os.mkdir(original_files_dir)

        unpack_imgt_zip(input_file, original_files_dir)

        total, remain = filter_imgt_dir(original_files_dir, loci)
        logging.info("%s\t%s", total, remain)

        make_new_xz_file(original_files_dir, output_file)
    finally:
        # The extracted copy is only scratch space; do not leave it behind.
        shutil.rmtree(work_dir, ignore_errors=True)


if __name__ == "__main__":
    main()
<!-- Tool wrapper for imgt_locus_split.py (file: imgt_locus_split.xml). -->
<tool id="imgt_locus_split" name="IMGT Locus Split" version="0.1">
    <requirements>
        <requirement type="package" version="0.4.13">python-magic</requirement>
    </requirements>
    <!-- Every interpolated value is single-quoted so that paths or names
         containing spaces or shell metacharacters reach the script as one
         argument each. -->
    <command detect_errors="exit_code"><![CDATA[
        python3 '$__tool_directory__/imgt_locus_split.py'
            --input '$input'
            --loci '$loci'
            --output '$output'
    ]]></command>
    <inputs>
        <param name="input" type="data" format="data" label="The IMGT zip file to be split"/>
        <param name="loci" type="select" label="Loci" multiple="true" display="checkboxes">
            <option value="IG" selected="true">IG</option>
            <option value="IGH">IGH</option>
            <option value="IGK">IGK</option>
            <option value="IGL">IGL</option>
            <option value="TR" selected="true">TR</option>
            <option value="TRA">TRA</option>
            <option value="TRB">TRB</option>
            <option value="TRD">TRD</option>
            <option value="TRG">TRG</option>
        </param>
    </inputs>
    <outputs>
        <data format="imgt_archive" name="output" label="${input.name} $loci" />
    </outputs>
    <help><![CDATA[
        Creates one new IMGT archive that contains only the sequences whose
        "V-GENE and allele" annotation matches at least one of the checked loci.
    ]]></help>
    <citations>
        <citation type="doi">10.1093/nar/gku1056</citation>
    </citations>
</tool>