Mercurial > repos > davidvanzessen > imgt_locus_split

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/imgt_locus_split.py	Thu Jul 13 10:24:39 2017 -0400
@@ -0,0 +1,138 @@
+import argparse
+import logging
+import os
+import re
+import shutil
+import sys
+import tarfile
+import tempfile
+import zipfile
+
+import magic
+
+# Selects file names that start with digits and an underscore, where the next
+# character is not "P" (presumably to skip a parameters file - confirm).
+# Raw string so that "\d" reaches the regex engine instead of being read as an
+# (invalid) string escape, which newer Python versions warn about.
+imgt_file_regex = re.compile(r"^\d+_[^P]")
+
+
+def sniff_imgt_type(input_file):
+    """Return the first word of the ``magic`` description of ``input_file``.
+
+    The complete description is written to the debug log; only the text in
+    front of the first space is handed back to the caller.
+    """
+    description = magic.Magic().from_file(input_file)
+    logging.debug("File type of {0} is {1}".format(input_file, description))
+    first_word, _, _ = description.partition(" ")
+    return first_word
+
+
+def unpack_imgt_zip(input_file, output_dir):
+    """Extract an IMGT archive into ``output_dir``.
+
+    The archive type is taken from ``sniff_imgt_type``: "Zip" is unpacked with
+    ``zipfile`` and "XZ" with ``tarfile``. If the extraction produced exactly
+    one entry and that entry is a directory, its contents are moved up into
+    ``output_dir`` and the now-empty directory is removed.
+
+    Args:
+        input_file: Path of the archive to unpack.
+        output_dir: Existing directory that receives the extracted files.
+
+    Raises:
+        IOError: If the sniffed type is neither "Zip" nor "XZ".
+    """
+    imgt_type = sniff_imgt_type(input_file)
+    if imgt_type == "Zip":
+        with zipfile.ZipFile(input_file) as inf:
+            inf.extractall(output_dir)
+    elif imgt_type == "XZ":
+        with tarfile.open(input_file) as inf:
+            # The archive is user supplied: when this Python offers extraction
+            # filters, reject members that would end up outside output_dir.
+            if hasattr(tarfile, "data_filter"):
+                inf.extractall(output_dir, filter="data")
+            else:
+                inf.extractall(output_dir)
+    else:
+        raise IOError("Unsupported file type: {0}".format(imgt_type))
+    logging.debug("Extracted {0} to {1}".format(input_file, output_dir))
+
+    extracted = os.listdir(output_dir)
+    if len(extracted) != 1:
+        return
+    wrapper_dir = os.path.join(output_dir, extracted[0])
+    if not os.path.isdir(wrapper_dir):
+        return
+
+    # A single top-level directory wraps the real files: flatten it.
+    logging.info("{0} is an older IMGT zip, removing extra dir".format(input_file))
+    for name in os.listdir(wrapper_dir):
+        shutil.move(os.path.join(wrapper_dir, name), os.path.join(output_dir, name))
+    shutil.rmtree(wrapper_dir)
+
+
+def filter_tabular_file(old_file, new_file, column, regex):
+    """Copy the rows of a tab-separated file whose ``column`` matches ``regex``.
+
+    The first line of ``old_file`` is treated as the header: it is always
+    copied to ``new_file`` and is used to locate ``column``. Every following
+    line is counted, and is copied only when it has a field at that position
+    and ``regex.search`` finds a match in it.
+
+    Args:
+        old_file: Path of the tab-separated input file.
+        new_file: Path of the file to write (overwritten if it exists).
+        column: Exact header name of the column to test.
+        regex: Compiled regular expression applied with ``search``.
+
+    Returns:
+        A ``(total, remain)`` tuple: the number of data rows read and the
+        number of rows written.
+
+    Raises:
+        ValueError: If ``column`` is not present in the header line.
+    """
+    logging.debug("Filtering {0} with {1}".format(old_file, regex.pattern))
+    total = 0
+    remain = 0
+    with open(old_file, 'r') as of, open(new_file, 'w') as nf:
+        header = of.readline()
+        if not header:
+            # Empty input: nothing to locate, nothing to copy.
+            return total, remain
+        # Strip the line ending first, otherwise the last header field still
+        # carries "\n" and can never equal the requested column name.
+        column_index = header.rstrip("\r\n").split("\t").index(column)
+        nf.write(header)
+        for line in of:
+            total += 1
+            splt = line.split("\t")
+            # Strictly greater: a row with exactly column_index fields has no
+            # field at that index.
+            if len(splt) > column_index and regex.search(splt[column_index]):
+                remain += 1
+                nf.write(line)
+    return total, remain
+
+
+def all_same_in_list(l):
+    """Return True when every element of ``l`` equals its first element.
+
+    Lists with fewer than two elements are reported as uniform.
+    """
+    remainder = l[1:]
+    if not remainder:
+        return True
+    reference = l[0]
+    for candidate in remainder:
+        if not reference == candidate:
+            return False
+    return True
+
+
+def filter_imgt_dir(imgt_dir, loci):
+    """Filter every IMGT table in ``imgt_dir`` in place, keeping only ``loci``.
+
+    Files whose name matches ``imgt_file_regex`` are filtered on their
+    "V-GENE and allele" column with a regex built by joining ``loci`` with
+    "|". Each file is rewritten through a temporary "tmp.txt" in the same
+    directory. A warning is logged when the files do not all report the same
+    row counts.
+
+    Args:
+        imgt_dir: Directory holding the extracted IMGT files.
+        loci: Iterable of locus strings, used as regex alternatives.
+
+    Returns:
+        ``(total, remain)`` of the first file processed, or ``(0, 0)`` when
+        no file in the directory matched.
+    """
+    logging.info("Filtering {0} with {1}".format(imgt_dir, loci))
+    imgt_files = [f for f in os.listdir(imgt_dir) if imgt_file_regex.match(f)]
+    if not imgt_files:
+        # Without this guard totals[0] below would raise IndexError.
+        logging.warning("No IMGT files found in {0}, nothing to filter".format(imgt_dir))
+        return 0, 0
+    tmp_file = os.path.join(imgt_dir, "tmp.txt")
+    totals = []
+    remains = []
+    loci_regex = re.compile("|".join(loci))
+    for imgt_file in imgt_files:
+        imgt_file = os.path.join(imgt_dir, imgt_file)
+        total, remain = filter_tabular_file(imgt_file, tmp_file, "V-GENE and allele", loci_regex)
+        totals.append(total)
+        remains.append(remain)
+        logging.debug("{0} rows, {1} after filtering".format(total, remain))
+        # Replace the original table with its filtered copy.
+        shutil.move(tmp_file, imgt_file)
+    if not (all_same_in_list(totals) and all_same_in_list(remains)):
+        logging.warning("Not all files had the same number of sequences remaining for {0}: {1}".format(imgt_dir, remains))
+    return totals[0], remains[0]
+
+
+def make_new_xz_file(input_dir, output_file):
+    """Pack every entry of ``input_dir`` into an xz-compressed tar archive.
+
+    Entries are stored under their bare names, without the ``input_dir``
+    prefix.
+    """
+    logging.info("Creating new IMGT zip for {0} at {1}".format(input_dir, output_file))
+    # Take the listing before the archive is opened for writing.
+    entry_names = os.listdir(input_dir)
+    with tarfile.open(output_file, 'w:xz') as archive:
+        for entry_name in entry_names:
+            logging.debug("Writing {0} to new IMGT zip".format(entry_name))
+            archive.add(os.path.join(input_dir, entry_name), arcname=entry_name)
+
+
+def main():
+    """Command-line entry point: unpack, filter by locus, and repack.
+
+    Reads ``--input``, ``--loci`` (comma separated) and ``--output``, logs to
+    "./log.html" and to stdout, extracts the input archive into a temporary
+    directory, filters the IMGT tables to the requested loci and writes the
+    result to ``--output`` as an xz-compressed tar. The temporary directory
+    is removed afterwards, whether or not the run succeeded.
+
+    Raises:
+        Exception: If ``--loci`` contains no non-empty entry.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--input", help="The input IMGT file", required=True)
+    parser.add_argument("-l", "--loci", help="The Loci to filter on", required=True)
+    parser.add_argument("-o", "--output", help="The output file for the new IMGT zip with just the filtered sequences", required=True)
+
+    logging.basicConfig(filename="./log.html", level=logging.DEBUG, format="%(asctime)s:&emsp;%(message)s <br />",
+                        datefmt='%Y/%m/%d %H:%M:%S')
+    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
+    logging.info("Started IMGT locus split")
+
+    args = parser.parse_args()
+    input_file = args.input
+    # str.split always yields at least one element, so drop blank entries
+    # explicitly; a blank locus would become an empty regex alternative that
+    # matches every row.
+    loci = [locus.strip() for locus in args.loci.split(",") if locus.strip()]
+    output_file = args.output
+
+    logging.debug("All Parameters:")
+    logging.debug("Input: {0}".format(input_file))
+    logging.debug("Loci: {0}".format(loci))
+    logging.debug("Output: {0}".format(output_file))
+
+    if not loci:
+        raise Exception("No locus selected, nothing to do")
+
+    work_dir = tempfile.mkdtemp()
+    try:
+        original_files_dir = os.path.join(work_dir, "original")
+        os.mkdir(original_files_dir)
+
+        unpack_imgt_zip(input_file, original_files_dir)
+
+        total, remain = filter_imgt_dir(original_files_dir, loci)
+        logging.info("{0}\t{1}".format(total, remain))
+
+        make_new_xz_file(original_files_dir, output_file)
+    finally:
+        # Do not leave the extracted copy behind in the temp directory.
+        shutil.rmtree(work_dir, ignore_errors=True)
+
+
+# Run the tool only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/imgt_locus_split.xml	Thu Jul 13 10:24:39 2017 -0400
@@ -0,0 +1,34 @@
+<tool id="imgt_locus_split" name="IMGT Locus Split" version="0.1">
+    <requirements>
+        <requirement type="package" version="0.4.13">python-magic</requirement>
+    </requirements>
+    <!-- Substituted values are single-quoted so that paths or values containing
+         spaces or shell metacharacters reach the script as one argument each. -->
+    <command detect_errors="exit_code"><![CDATA[
+        python3 '$__tool_directory__/imgt_locus_split.py'
+            --input '$input'
+            --loci '$loci'
+            --output '$output'
+    ]]></command>
+    <inputs>
+        <param name="input" type="data" format="data" label="The IMGT zip file to be split"/>
+        <!-- The checked values are passed to the script as a single "loci" argument. -->
+        <param name="loci" type="select" label="Loci" multiple="true" display="checkboxes">
+            <option value="IG" selected="true">IG</option>
+            <option value="IGH">IGH</option>
+            <option value="IGK">IGK</option>
+            <option value="IGL">IGL</option>
+            <option value="TR" selected="true">TR</option>
+            <option value="TRA">TRA</option>
+            <option value="TRB">TRB</option>
+            <option value="TRD">TRD</option>
+            <option value="TRG">TRG</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data format="imgt_archive" name="output" label="${input.name} $loci" />
+    </outputs>
+    <help><![CDATA[
+        Creates a single new IMGT archive (xz-compressed tar) that contains only the
+        sequences whose "V-GENE and allele" value matches at least one of the checked loci.
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1093/nar/gku1056</citation>
+    </citations>
+</tool>
\ No newline at end of file