# HG changeset patch
# User davidvanzessen
# Date 1551778907 18000
# Node ID 4bb8f65231307fa203b9d8f181a36d3bbd1f6c26
# Parent 418b7dbc8947e681aa383ebe51c84d91210b1868
Uploaded
diff -r 418b7dbc8947 -r 4bb8f6523130 imgt_locus_split.py
--- a/imgt_locus_split.py Mon Jul 17 08:54:02 2017 -0400
+++ b/imgt_locus_split.py Tue Mar 05 04:41:47 2019 -0500
@@ -14,18 +14,17 @@
def sniff_imgt_type(input_file):
- m = magic.Magic()
- file_type = m.from_file(input_file)
+ file_type = magic.from_file(input_file)
logging.debug("File type of {0} is {1}".format(input_file, file_type))
- return file_type.split(" ")[0]
+ return file_type
def unpack_imgt_zip(input_file, output_dir):
imgt_type = sniff_imgt_type(input_file)
- if imgt_type == "Zip":
+ if imgt_type.startswith("Zip"):
with zipfile.ZipFile(input_file) as inf:
inf.extractall(output_dir)
- elif imgt_type == "XZ":
+ elif imgt_type.startswith("XZ"):
with tarfile.open(input_file) as inf:
inf.extractall(output_dir)
else:
@@ -44,8 +43,8 @@
shutil.rmtree(check)
-def filter_imgt_file(old_file, new_file, column, fltr):
- logging.debug("Filtering {0} with {1}".format(old_file, fltr))
+def filter_tabular_file(old_file, new_file, column, regex):
+ logging.debug("Filtering {0} with {1}".format(old_file, regex.pattern))
first = True
total = 0
remain = 0
@@ -59,7 +58,7 @@
nf.write(line)
continue
total += 1
- if len(splt) > column_index and splt[column_index].find(fltr) != -1:
+ if len(splt) >= column_index and regex.search(splt[column_index]):
remain += 1
nf.write(line)
return total, remain
@@ -69,21 +68,22 @@
return all(l[0] == x for x in l[1:])
-def filter_imgt_dir(imgt_dir, locus):
- logging.info("Working on {0}".format(locus))
+def filter_imgt_dir(imgt_dir, loci):
+ logging.info("Filtering {0} with {1}".format(imgt_dir, loci))
imgt_files = [f for f in os.listdir(imgt_dir) if imgt_file_regex.match(f)]
tmp_file = os.path.join(imgt_dir, "tmp.txt")
totals = []
remains = []
+ loci_regex = re.compile("|".join(loci))
for imgt_file in imgt_files:
imgt_file = os.path.join(imgt_dir, imgt_file)
- total, remain = filter_imgt_file(imgt_file, tmp_file, "V-GENE and allele", locus)
+ total, remain = filter_tabular_file(imgt_file, tmp_file, "V-GENE and allele", loci_regex)
totals.append(total)
remains.append(remain)
logging.debug("{0} rows, {1} after filtering".format(total, remain))
shutil.move(tmp_file, imgt_file)
if not (all_same_in_list(totals) and all_same_in_list(remains)):
- logging.warning("Not all files had the same number of sequences remaining for {0}".format(imgt_dir))
+ logging.warning("Not all files had the same number of sequences remaining for {0}: {1}".format(imgt_dir, remains))
return totals[0], remains[0]
@@ -99,16 +99,9 @@
def main():
parser = argparse.ArgumentParser()
- parser.add_argument("--input", help="The input IMGT file", required=True)
- parser.add_argument("--output-ig", help="The output file for new IMGT ZIP with just IG sequences", default="None")
- parser.add_argument("--output-igh", help="The output file for new IMGT ZIP with just IGH sequences", default="None")
- parser.add_argument("--output-igk", help="The output file for new IMGT ZIP with just IGK sequences", default="None")
- parser.add_argument("--output-igl", help="The output file for new IMGT ZIP with just IGL sequences", default="None")
- parser.add_argument("--output-tr", help="The output file for new IMGT ZIP with just TR sequences", default="None")
- parser.add_argument("--output-tra", help="The output file for new IMGT ZIP with just TRA sequences", default="None")
- parser.add_argument("--output-trb", help="The output file for new IMGT ZIP with just TRB sequences", default="None")
- parser.add_argument("--output-trd", help="The output file for new IMGT ZIP with just TRD sequences", default="None")
- parser.add_argument("--output-trg", help="The output file for new IMGT ZIP with just TRG sequences", default="None")
+ parser.add_argument("-i", "--input", help="The input IMGT file", required=True)
+ parser.add_argument("-l", "--loci", help="The Loci to filter on", required=True)
+ parser.add_argument("-o", "--output", help="The output file for the new IMGT zip with just the filtered sequences", required=True)
logging.basicConfig(filename="./log.html", level=logging.DEBUG, format="%(asctime)s: %(message)s\n",
datefmt='%Y/%m/%d %H:%M:%S')
@@ -117,57 +110,27 @@
args = parser.parse_args()
input_file = args.input
- output_ig = args.output_ig
- output_igh = args.output_igh
- output_igk = args.output_igk
- output_igl = args.output_igl
- output_tr = args.output_tr
- output_tra = args.output_tra
- output_trb = args.output_trb
- output_trd = args.output_trd
- output_trg = args.output_trg
-
- loci = {
- "IG": output_ig,
- "IGH": output_igh,
- "IGK": output_igk,
- "IGL": output_igl,
- "TR": output_tr,
- "TRA": output_tra,
- "TRB": output_trb,
- "TRD": output_trd,
- "TRG": output_trg
- }
-
- loci_to_filter = {}
+ loci = args.loci.split(",")
+ output_file = args.output
logging.debug("All Parameters:")
logging.debug("Input: {0}".format(input_file))
- for locus, path in loci.items():
- logging.debug("{0}: {1}".format(locus, path))
- if path != "None" and os.path.isdir(os.path.split(path)[0]):
- loci_to_filter[locus] = path
+ logging.debug("Loci: {0}".format(loci))
+ logging.debug("Output: {0}".format(output_file))
- if len(loci_to_filter) == 0:
+ if len(loci) == 0:
raise Exception("No locus selected, nothing to do")
- logging.info("Parameters:")
- for locus, path in loci_to_filter.items():
- logging.info("{0}: {1}".format(locus, path))
-
work_dir = tempfile.mkdtemp()
original_files_dir = os.path.join(work_dir, "original")
os.mkdir(original_files_dir)
unpack_imgt_zip(input_file, original_files_dir)
- for locus, path in loci_to_filter.items():
- locus_dir = os.path.join(work_dir, locus)
- shutil.copytree(original_files_dir, locus_dir)
- total, remain = filter_imgt_dir(locus_dir, locus)
- logging.info("{0}\t{1}\t{2}\t{3}".format(locus, path, total, remain))
+ total, remain = filter_imgt_dir(original_files_dir, loci)
+ logging.info("{0}\t{1}".format(total, remain))
- make_new_xz_file(locus_dir, loci_to_filter[locus])
+ make_new_xz_file(original_files_dir, output_file)
if __name__ == "__main__":
diff -r 418b7dbc8947 -r 4bb8f6523130 imgt_locus_split.xml
--- a/imgt_locus_split.xml Mon Jul 17 08:54:02 2017 -0400
+++ b/imgt_locus_split.xml Tue Mar 05 04:41:47 2019 -0500
@@ -1,19 +1,12 @@
- python-magic
+ python-magic
@@ -30,33 +23,7 @@
-
- loci.__contains__('IG')
-
-
- loci.__contains__('IGH')
-
-
- loci.__contains__('IGK')
-
-
- loci.__contains__('IGL')
-
-
- loci.__contains__('TR')
-
-
- loci.__contains__('TRA')
-
-
- loci.__contains__('TRB')
-
-
- loci.__contains__('TRD')
-
-
- loci.__contains__('TRG')
-
+