Mercurial > repos > davidvanzessen > imgt_locus_split
view imgt_locus_split.py @ 1:418b7dbc8947 draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 17 Jul 2017 08:54:02 -0400 |
parents | b00c257f0a67 |
children | 4bb8f6523130 |
line wrap: on
line source
"""Split an IMGT archive into new archives that each hold a single locus.

The input archive (zip or xz-compressed tar) is unpacked once; for every
requested locus the unpacked files are copied, every numbered IMGT file is
filtered on its "V-GENE and allele" column, and the result is written to a
new xz-compressed tar archive.
"""

import argparse
import logging
import os
import re
import shutil
import sys
import tarfile
import tempfile
import zipfile

try:
    import magic
except ImportError:
    # python-magic depends on the native libmagic library.  When it is not
    # available the archive type is detected with the standard library
    # instead (see _describe_without_magic).
    magic = None

# Files to filter are named "<digits>_<name>"; names whose first character
# after the underscore is "P" are skipped (presumably the parameters file,
# which has no per-sequence rows -- confirm against a real IMGT archive).
imgt_file_regex = re.compile(r"^\d+_[^P]")

# First six bytes of every .xz stream.
_XZ_SIGNATURE = b"\xfd7zXZ\x00"

# Every locus that has an "--output-<locus>" command line option, in the
# order they are reported in the log.
_LOCI = ("IG", "IGH", "IGK", "IGL", "TR", "TRA", "TRB", "TRD", "TRG")


def _describe_without_magic(input_file):
    """Return a libmagic-like description of input_file using only the stdlib.

    Only the two archive types this script understands are recognised; the
    first word of the returned text matches what unpack_imgt_zip() expects
    ("XZ" or "Zip").  Anything else is reported as "data".
    """
    with open(input_file, "rb") as handle:
        header = handle.read(len(_XZ_SIGNATURE))
    if header == _XZ_SIGNATURE:
        return "XZ compressed data"
    if zipfile.is_zipfile(input_file):
        return "Zip archive data"
    return "data"


def sniff_imgt_type(input_file):
    """Return the first word of the detected file type of input_file.

    Uses python-magic when it could be imported, otherwise a small
    standard-library fallback.  The full description is logged at debug
    level.
    """
    if magic is not None:
        file_type = magic.Magic().from_file(input_file)
    else:
        file_type = _describe_without_magic(input_file)
    logging.debug("File type of {0} is {1}".format(input_file, file_type))
    return file_type.split(" ")[0]


def unpack_imgt_zip(input_file, output_dir):
    """Extract the IMGT archive input_file into output_dir.

    Zip archives and xz-compressed tar archives are supported.  When the
    archive contains nothing but a single top-level directory, the contents
    of that directory are moved up into output_dir and the directory is
    removed.

    Raises:
        IOError: if the detected file type is neither "Zip" nor "XZ".
    """
    imgt_type = sniff_imgt_type(input_file)
    if imgt_type == "Zip":
        with zipfile.ZipFile(input_file) as inf:
            inf.extractall(output_dir)
    elif imgt_type == "XZ":
        with tarfile.open(input_file) as inf:
            if hasattr(tarfile, "data_filter"):
                # The archive is user supplied: refuse members that would
                # be written outside output_dir.
                inf.extractall(output_dir, filter="data")
            else:
                # Interpreter without extraction filter support.
                inf.extractall(output_dir)
    else:
        raise IOError("Unsuppported file type: {0}".format(imgt_type))
    logging.debug("Extracted {0} to {1}".format(input_file, output_dir))

    entries = os.listdir(output_dir)
    if len(entries) != 1:
        return
    nested_dir = os.path.join(output_dir, entries[0])
    if not os.path.isdir(nested_dir):
        return

    logging.info("{0} is an older IMGT zip, removing extra dir".format(input_file))
    for name in os.listdir(nested_dir):
        shutil.move(os.path.join(nested_dir, name), os.path.join(output_dir, name))
    shutil.rmtree(nested_dir)


def filter_imgt_file(old_file, new_file, column, fltr):
    """Copy old_file to new_file, keeping only rows whose column contains fltr.

    The first line of old_file is treated as a tab separated header and is
    always copied.  A data row is kept when it has a field at the position
    of column and that field contains fltr as a substring; rows that are
    too short are counted but dropped.

    Returns:
        A (total, remain) tuple: the number of data rows read and the
        number of data rows written.

    Raises:
        ValueError: if column is not present in the header.
    """
    logging.debug("Filtering {0} with {1}".format(old_file, fltr))
    total = 0
    remain = 0
    with open(old_file, 'r') as of, open(new_file, 'w') as nf:
        header = of.readline()
        if not header:
            # Empty input: nothing to copy, nothing to count.
            return total, remain
        # Strip the line ending first, otherwise a column that is last in
        # the header would be compared as "name\n" and never be found.
        column_names = header.rstrip("\r\n").split("\t")
        try:
            column_index = column_names.index(column)
        except ValueError:
            raise ValueError(
                "Column '{0}' not found in the header of {1}".format(column, old_file)
            )
        nf.write(header)
        for line in of:
            total += 1
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) > column_index and fltr in fields[column_index]:
                remain += 1
                nf.write(line)
    return total, remain


def all_same_in_list(l):
    """Return True when every element of l equals the first one.

    An empty or single element list is considered uniform.
    """
    return all(l[0] == x for x in l[1:])


def filter_imgt_dir(imgt_dir, locus):
    """Filter every numbered IMGT file in imgt_dir in place for locus.

    Each file matching imgt_file_regex is filtered on its
    "V-GENE and allele" column through a temporary "tmp.txt" inside
    imgt_dir, which then replaces the original file.  A warning is logged
    when the files do not all report the same row counts.

    Returns:
        The (total, remain) row counts of the first file processed.

    Raises:
        IndexError: if imgt_dir contains no file matching imgt_file_regex.
    """
    logging.info("Working on {0}".format(locus))
    imgt_files = [f for f in os.listdir(imgt_dir) if imgt_file_regex.match(f)]
    if not imgt_files:
        # Fail with an explanation instead of an anonymous "list index out
        # of range" when the counts are read at the end.
        raise IndexError("No IMGT files to filter found in {0}".format(imgt_dir))

    tmp_file = os.path.join(imgt_dir, "tmp.txt")
    totals = []
    remains = []
    for name in imgt_files:
        imgt_file = os.path.join(imgt_dir, name)
        total, remain = filter_imgt_file(imgt_file, tmp_file, "V-GENE and allele", locus)
        totals.append(total)
        remains.append(remain)
        logging.debug("{0} rows, {1} after filtering".format(total, remain))
        shutil.move(tmp_file, imgt_file)

    if not (all_same_in_list(totals) and all_same_in_list(remains)):
        logging.warning("Not all files had the same number of sequences remaining for {0}".format(imgt_dir))
    return totals[0], remains[0]


def make_new_xz_file(input_dir, output_file):
    """Write every entry of input_dir to a new xz-compressed tar at output_file.

    Entries are stored under their bare name (no directory prefix) and in
    sorted order so that the same input always yields the same member order.
    """
    logging.info("Creating new IMGT zip for {0} at {1}".format(input_dir, output_file))
    with tarfile.open(output_file, 'w:xz') as out:
        for name in sorted(os.listdir(input_dir)):
            logging.debug("Writing {0} to new IMGT zip".format(name))
            out.add(os.path.join(input_dir, name), arcname=name)


def main(argv=None):
    """Command line entry point.

    Args:
        argv: optional list of command line arguments; defaults to
            sys.argv[1:] when None.

    Raises:
        Exception: if no usable output path was given for any locus.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", help="The input IMGT file", required=True)
    for locus in _LOCI:
        # An output that was not requested keeps the literal string "None".
        parser.add_argument(
            "--output-{0}".format(locus.lower()),
            help="The output file for new IMGT ZIP with just {0} sequences".format(locus),
            default="None",
        )

    logging.basicConfig(filename="./log.html", level=logging.DEBUG, format="%(asctime)s: %(message)s <br />", datefmt='%Y/%m/%d %H:%M:%S')
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
    logging.info("Started IMGT locus split")

    args = parser.parse_args(argv)
    input_file = args.input
    loci = {locus: getattr(args, "output_" + locus.lower()) for locus in _LOCI}

    loci_to_filter = {}
    logging.debug("All Parameters:")
    logging.debug("Input: {0}".format(input_file))
    for locus, path in loci.items():
        logging.debug("{0}: {1}".format(locus, path))
        # A bare file name has an empty directory part, which means the
        # current directory rather than "no such directory".
        if path != "None" and os.path.isdir(os.path.dirname(path) or "."):
            loci_to_filter[locus] = path

    if len(loci_to_filter) == 0:
        raise Exception("No locus selected, nothing to do")

    logging.info("Parameters:")
    for locus, path in loci_to_filter.items():
        logging.info("{0}: {1}".format(locus, path))

    work_dir = tempfile.mkdtemp()
    try:
        original_files_dir = os.path.join(work_dir, "original")
        os.mkdir(original_files_dir)
        unpack_imgt_zip(input_file, original_files_dir)

        for locus, path in loci_to_filter.items():
            locus_dir = os.path.join(work_dir, locus)
            shutil.copytree(original_files_dir, locus_dir)
            total, remain = filter_imgt_dir(locus_dir, locus)
            logging.info("{0}\t{1}\t{2}\t{3}".format(locus, path, total, remain))
            make_new_xz_file(locus_dir, path)
    finally:
        # The unpacked copies are only scratch data; never leave them behind,
        # not even when a step above fails.
        shutil.rmtree(work_dir, ignore_errors=True)


if __name__ == "__main__":
    main()