view imgt_locus_split.py @ 1:418b7dbc8947 draft

Uploaded
author davidvanzessen
date Mon, 17 Jul 2017 08:54:02 -0400
parents b00c257f0a67
children 4bb8f6523130
line wrap: on
line source

import argparse
import logging
import os
import re
import shutil
import sys
import tarfile
import tempfile
import zipfile

import magic

# Matches file names that start with digits, an underscore and then any
# character other than "P" (e.g. "1_Summary.txt").
# NOTE(review): presumably the "[^P]" is there to skip "11_Parameters.txt",
# which is not a per-sequence table - confirm against a real IMGT archive.
# A raw string is used so that "\d" is not treated as an (invalid) string escape.
imgt_file_regex = re.compile(r"^\d+_[^P]")


def sniff_imgt_type(input_file):
    """Return the first word of the libmagic description of *input_file*.

    The full description is logged at debug level; only the text before the
    first space is returned (callers in this file compare it against "Zip"
    and "XZ").
    """
    detector = magic.Magic()
    description = detector.from_file(input_file)
    logging.debug("File type of {0} is {1}".format(input_file, description))
    # Everything up to the first space; the whole string when there is none.
    leading_word, _, _ = description.partition(" ")
    return leading_word


def unpack_imgt_zip(input_file, output_dir):
    """Extract an IMGT archive into *output_dir* and flatten a lone wrapper directory.

    The archive type is taken from ``sniff_imgt_type``: "Zip" archives are
    extracted with ``zipfile``, "XZ" archives with ``tarfile``. If, after
    extraction, *output_dir* holds exactly one entry and that entry is a
    directory, its contents are moved up into *output_dir* and the now-empty
    directory is removed.

    Args:
        input_file: path of the archive to extract.
        output_dir: existing directory that receives the extracted files.

    Raises:
        IOError: if the sniffed type is neither "Zip" nor "XZ".
    """
    imgt_type = sniff_imgt_type(input_file)
    if imgt_type == "Zip":
        with zipfile.ZipFile(input_file) as inf:
            inf.extractall(output_dir)
    elif imgt_type == "XZ":
        with tarfile.open(input_file) as inf:
            # The archive is user supplied: without an extraction filter a
            # crafted tar can write outside output_dir (absolute paths, "..",
            # links). Use the "data" filter wherever the interpreter has it.
            if hasattr(tarfile, "data_filter"):
                inf.extractall(output_dir, filter="data")
            else:
                # Older interpreter without extraction filters.
                inf.extractall(output_dir)
    else:
        raise IOError("Unsuppported file type: {0}".format(imgt_type))
    logging.debug("Extracted {0} to {1}".format(input_file, output_dir))

    entries = os.listdir(output_dir)
    if len(entries) != 1:
        return
    nested_dir = os.path.join(output_dir, entries[0])
    if not os.path.isdir(nested_dir):
        return

    # Everything sits inside a single wrapper directory: hoist the files up
    # one level so the rest of the script finds them directly in output_dir.
    logging.info("{0} is an older IMGT zip, removing extra dir".format(input_file))
    for name in os.listdir(nested_dir):
        shutil.move(os.path.join(nested_dir, name), os.path.join(output_dir, name))
    shutil.rmtree(nested_dir)


def filter_imgt_file(old_file, new_file, column, fltr):
    """Copy the rows of a tab-separated file whose *column* contains *fltr*.

    The first line of *old_file* is treated as the header: it is always
    written to *new_file* and is used to locate *column*. Every following
    line counts towards the total; a line is kept when it has enough fields
    and the field under *column* contains the substring *fltr*.

    Args:
        old_file: path of the tab-separated input file.
        new_file: path of the output file (overwritten).
        column: exact header name of the column to test.
        fltr: substring that must occur in that column for a row to be kept.

    Returns:
        A ``(total, remain)`` tuple: the number of data rows read and the
        number of rows written. An empty input file yields ``(0, 0)``.

    Raises:
        ValueError: if *column* is not present in the header line.
    """
    logging.debug("Filtering {0} with {1}".format(old_file, fltr))
    total = 0
    remain = 0
    with open(old_file, 'r') as of, open(new_file, 'w') as nf:
        header = next(of, None)
        if header is None:
            # Empty input: nothing to copy, the output stays empty as well.
            return total, remain
        # Strip the line ending before splitting, otherwise the last header
        # name still carries "\n" and can never equal `column`.
        column_index = header.rstrip("\r\n").split("\t").index(column)
        nf.write(header)
        for line in of:
            total += 1
            fields = line.rstrip("\r\n").split("\t")
            # Short rows (fewer fields than the header) are dropped.
            if len(fields) > column_index and fltr in fields[column_index]:
                remain += 1
                nf.write(line)
    return total, remain


def all_same_in_list(l):
    """Return True when every element of *l* equals its first element.

    Lists with zero or one element are trivially "all the same".
    """
    for candidate in l[1:]:
        if not (l[0] == candidate):
            return False
    return True


def filter_imgt_dir(imgt_dir, locus):
    """Filter every IMGT table in *imgt_dir* in place, keeping rows for *locus*.

    Each file whose name matches ``imgt_file_regex`` is filtered on its
    "V-GENE and allele" column through a scratch file ("tmp.txt" inside
    *imgt_dir*) which then replaces the original. A warning is logged when
    the files do not all report the same row counts.

    Returns:
        ``(total, remain)`` as reported for the first file processed.

    Raises:
        IndexError: if no file in *imgt_dir* matches ``imgt_file_regex``.
    """
    logging.info("Working on {0}".format(locus))
    scratch_path = os.path.join(imgt_dir, "tmp.txt")
    totals, remains = [], []
    for name in os.listdir(imgt_dir):
        if not imgt_file_regex.match(name):
            continue
        table_path = os.path.join(imgt_dir, name)
        counts = filter_imgt_file(table_path, scratch_path, "V-GENE and allele", locus)
        totals.append(counts[0])
        remains.append(counts[1])
        logging.debug("{0} rows, {1} after filtering".format(counts[0], counts[1]))
        # Replace the original table with its filtered version.
        shutil.move(scratch_path, table_path)
    if not all_same_in_list(totals) or not all_same_in_list(remains):
        logging.warning("Not all files had the same number of sequences remaining for {0}".format(imgt_dir))
    return totals[0], remains[0]


def make_new_xz_file(input_dir, output_file):
    """Pack every entry of *input_dir* into an xz-compressed tar at *output_file*.

    Entries are stored under their bare names, without the *input_dir*
    prefix. The log messages call the result an "IMGT zip", but the archive
    written is a tar opened in 'w:xz' mode.
    """
    logging.info("Creating new IMGT zip for {0} at {1}".format(input_dir, output_file))
    entry_names = os.listdir(input_dir)
    with tarfile.open(output_file, 'w:xz') as archive:
        for entry_name in entry_names:
            logging.debug("Writing {0} to new IMGT zip".format(entry_name))
            # listdir() yields bare names, so the name itself is the arcname.
            archive.add(os.path.join(input_dir, entry_name), arcname=entry_name)


def main():
    """Command-line entry point: split one IMGT archive into per-locus archives.

    The input archive is unpacked once into a temporary working directory.
    For every ``--output-<locus>`` option that was given (the string "None"
    means "not requested") and whose parent directory exists, the unpacked
    files are copied, filtered down to that locus and re-packed at the
    requested path. Progress is logged to ``./log.html`` and to stdout.

    Raises:
        Exception: if no usable output was requested.
    """
    locus_names = ("IG", "IGH", "IGK", "IGL", "TR", "TRA", "TRB", "TRD", "TRG")

    parser = argparse.ArgumentParser()
    parser.add_argument("--input", help="The input IMGT file", required=True)
    for locus in locus_names:
        # "None" (the string) is the sentinel for "output not requested".
        parser.add_argument(
            "--output-{0}".format(locus.lower()),
            help="The output file for new IMGT ZIP with just {0} sequences".format(locus),
            default="None",
        )

    logging.basicConfig(filename="./log.html", level=logging.DEBUG, format="%(asctime)s:&emsp;%(message)s <br />",
                        datefmt='%Y/%m/%d %H:%M:%S')
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
    logging.info("Started IMGT locus split")

    args = parser.parse_args()
    input_file = args.input
    loci = dict(
        (locus, getattr(args, "output_{0}".format(locus.lower())))
        for locus in locus_names
    )

    loci_to_filter = {}

    logging.debug("All Parameters:")
    logging.debug("Input: {0}".format(input_file))
    for locus, path in loci.items():
        logging.debug("{0}: {1}".format(locus, path))
        if path == "None":
            continue
        # A bare file name has an empty directory part; that means the
        # current directory, not "no directory".
        parent_dir = os.path.dirname(path) or os.curdir
        if os.path.isdir(parent_dir):
            loci_to_filter[locus] = path
        else:
            logging.warning("Skipping {0}: output directory {1} does not exist".format(locus, parent_dir))

    if not loci_to_filter:
        raise Exception("No locus selected, nothing to do")

    logging.info("Parameters:")
    for locus, path in loci_to_filter.items():
        logging.info("{0}: {1}".format(locus, path))

    work_dir = tempfile.mkdtemp()
    try:
        original_files_dir = os.path.join(work_dir, "original")
        os.mkdir(original_files_dir)

        unpack_imgt_zip(input_file, original_files_dir)

        for locus, path in loci_to_filter.items():
            # Each locus is filtered in its own copy so the unpacked
            # originals stay intact for the next locus.
            locus_dir = os.path.join(work_dir, locus)
            shutil.copytree(original_files_dir, locus_dir)
            total, remain = filter_imgt_dir(locus_dir, locus)
            logging.info("{0}\t{1}\t{2}\t{3}".format(locus, path, total, remain))

            make_new_xz_file(locus_dir, path)
    finally:
        # mkdtemp() never cleans up after itself; remove the working copies
        # whether or not the split succeeded.
        shutil.rmtree(work_dir, ignore_errors=True)

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()