Mercurial > repos > davidvanzessen > imgt_locus_split
comparison imgt_locus_split.py @ 2:4bb8f6523130 draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 05 Mar 2019 04:41:47 -0500 |
parents | 418b7dbc8947 |
children |
comparison
equal
deleted
inserted
replaced
1:418b7dbc8947 | 2:4bb8f6523130 |
---|---|
12 | 12 |
13 imgt_file_regex = re.compile("^\d+_[^P]") | 13 imgt_file_regex = re.compile("^\d+_[^P]") |
14 | 14 |
15 | 15 |
16 def sniff_imgt_type(input_file): | 16 def sniff_imgt_type(input_file): |
17 m = magic.Magic() | 17 file_type = magic.from_file(input_file) |
18 file_type = m.from_file(input_file) | |
19 logging.debug("File type of {0} is {1}".format(input_file, file_type)) | 18 logging.debug("File type of {0} is {1}".format(input_file, file_type)) |
20 return file_type.split(" ")[0] | 19 return file_type |
21 | 20 |
22 | 21 |
23 def unpack_imgt_zip(input_file, output_dir): | 22 def unpack_imgt_zip(input_file, output_dir): |
24 imgt_type = sniff_imgt_type(input_file) | 23 imgt_type = sniff_imgt_type(input_file) |
25 if imgt_type == "Zip": | 24 if imgt_type.startswith("Zip"): |
26 with zipfile.ZipFile(input_file) as inf: | 25 with zipfile.ZipFile(input_file) as inf: |
27 inf.extractall(output_dir) | 26 inf.extractall(output_dir) |
28 elif imgt_type == "XZ": | 27 elif imgt_type.startswith("XZ"): |
29 with tarfile.open(input_file) as inf: | 28 with tarfile.open(input_file) as inf: |
30 inf.extractall(output_dir) | 29 inf.extractall(output_dir) |
31 else: | 30 else: |
32 raise IOError("Unsuppported file type: {0}".format(imgt_type)) | 31 raise IOError("Unsuppported file type: {0}".format(imgt_type)) |
33 logging.debug("Extracted {0} to {1}".format(input_file, output_dir)) | 32 logging.debug("Extracted {0} to {1}".format(input_file, output_dir)) |
42 file = os.path.join(check, file) | 41 file = os.path.join(check, file) |
43 shutil.move(file, new_file) | 42 shutil.move(file, new_file) |
44 shutil.rmtree(check) | 43 shutil.rmtree(check) |
45 | 44 |
46 | 45 |
47 def filter_imgt_file(old_file, new_file, column, fltr): | 46 def filter_tabular_file(old_file, new_file, column, regex): |
48 logging.debug("Filtering {0} with {1}".format(old_file, fltr)) | 47 logging.debug("Filtering {0} with {1}".format(old_file, regex.pattern)) |
49 first = True | 48 first = True |
50 total = 0 | 49 total = 0 |
51 remain = 0 | 50 remain = 0 |
52 with open(old_file, 'r') as of, open(new_file, 'w') as nf: | 51 with open(old_file, 'r') as of, open(new_file, 'w') as nf: |
53 column_index = -1 | 52 column_index = -1 |
57 column_index = splt.index(column) | 56 column_index = splt.index(column) |
58 first = False | 57 first = False |
59 nf.write(line) | 58 nf.write(line) |
60 continue | 59 continue |
61 total += 1 | 60 total += 1 |
62 if len(splt) > column_index and splt[column_index].find(fltr) != -1: | 61 if len(splt) >= column_index and regex.search(splt[column_index]): |
63 remain += 1 | 62 remain += 1 |
64 nf.write(line) | 63 nf.write(line) |
65 return total, remain | 64 return total, remain |
66 | 65 |
67 | 66 |
68 def all_same_in_list(l): | 67 def all_same_in_list(l): |
69 return all(l[0] == x for x in l[1:]) | 68 return all(l[0] == x for x in l[1:]) |
70 | 69 |
71 | 70 |
72 def filter_imgt_dir(imgt_dir, locus): | 71 def filter_imgt_dir(imgt_dir, loci): |
73 logging.info("Working on {0}".format(locus)) | 72 logging.info("Filtering {0} with {1}".format(imgt_dir, loci)) |
74 imgt_files = [f for f in os.listdir(imgt_dir) if imgt_file_regex.match(f)] | 73 imgt_files = [f for f in os.listdir(imgt_dir) if imgt_file_regex.match(f)] |
75 tmp_file = os.path.join(imgt_dir, "tmp.txt") | 74 tmp_file = os.path.join(imgt_dir, "tmp.txt") |
76 totals = [] | 75 totals = [] |
77 remains = [] | 76 remains = [] |
| 77 loci_regex = re.compile("|".join(loci)) |
78 for imgt_file in imgt_files: | 78 for imgt_file in imgt_files: |
79 imgt_file = os.path.join(imgt_dir, imgt_file) | 79 imgt_file = os.path.join(imgt_dir, imgt_file) |
80 total, remain = filter_imgt_file(imgt_file, tmp_file, "V-GENE and allele", locus) | 80 total, remain = filter_tabular_file(imgt_file, tmp_file, "V-GENE and allele", loci_regex) |
81 totals.append(total) | 81 totals.append(total) |
82 remains.append(remain) | 82 remains.append(remain) |
83 logging.debug("{0} rows, {1} after filtering".format(total, remain)) | 83 logging.debug("{0} rows, {1} after filtering".format(total, remain)) |
84 shutil.move(tmp_file, imgt_file) | 84 shutil.move(tmp_file, imgt_file) |
85 if not (all_same_in_list(totals) and all_same_in_list(remains)): | 85 if not (all_same_in_list(totals) and all_same_in_list(remains)): |
86 logging.warning("Not all files had the same number of sequences remaining for {0}".format(imgt_dir)) | 86 logging.warning("Not all files had the same number of sequences remaining for {0}: {1}".format(imgt_dir, remains)) |
87 return totals[0], remains[0] | 87 return totals[0], remains[0] |
88 | 88 |
89 | 89 |
90 def make_new_xz_file(input_dir, output_file): | 90 def make_new_xz_file(input_dir, output_file): |
91 logging.info("Creating new IMGT zip for {0} at {1}".format(input_dir, output_file)) | 91 logging.info("Creating new IMGT zip for {0} at {1}".format(input_dir, output_file)) |
97 out.add(imgt_file, arcname=os.path.basename(imgt_file)) | 97 out.add(imgt_file, arcname=os.path.basename(imgt_file)) |
98 | 98 |
99 | 99 |
100 def main(): | 100 def main(): |
101 parser = argparse.ArgumentParser() | 101 parser = argparse.ArgumentParser() |
102 parser.add_argument("--input", help="The input IMGT file", required=True) | 102 parser.add_argument("-i", "--input", help="The input IMGT file", required=True) |
103 parser.add_argument("--output-ig", help="The output file for new IMGT ZIP with just IG sequences", default="None") | 103 parser.add_argument("-l", "--loci", help="The Loci to filter on", required=True) |
104 parser.add_argument("--output-igh", help="The output file for new IMGT ZIP with just IGH sequences", default="None") | 104 parser.add_argument("-o", "--output", help="The output file for the new IMGT zip with just the filtered sequences", required=True) |
105 parser.add_argument("--output-igk", help="The output file for new IMGT ZIP with just IGK sequences", default="None") | |
106 parser.add_argument("--output-igl", help="The output file for new IMGT ZIP with just IGL sequences", default="None") | |
107 parser.add_argument("--output-tr", help="The output file for new IMGT ZIP with just TR sequences", default="None") | |
108 parser.add_argument("--output-tra", help="The output file for new IMGT ZIP with just TRA sequences", default="None") | |
109 parser.add_argument("--output-trb", help="The output file for new IMGT ZIP with just TRB sequences", default="None") | |
110 parser.add_argument("--output-trd", help="The output file for new IMGT ZIP with just TRD sequences", default="None") | |
111 parser.add_argument("--output-trg", help="The output file for new IMGT ZIP with just TRG sequences", default="None") | |
112 | 105 |
113 logging.basicConfig(filename="./log.html", level=logging.DEBUG, format="%(asctime)s: %(message)s <br />", | 106 logging.basicConfig(filename="./log.html", level=logging.DEBUG, format="%(asctime)s: %(message)s <br />", |
114 datefmt='%Y/%m/%d %H:%M:%S') | 107 datefmt='%Y/%m/%d %H:%M:%S') |
115 logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) | 108 logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) |
116 logging.info("Started IMGT locus split") | 109 logging.info("Started IMGT locus split") |
117 | 110 |
118 args = parser.parse_args() | 111 args = parser.parse_args() |
119 input_file = args.input | 112 input_file = args.input |
120 output_ig = args.output_ig | 113 loci = args.loci.split(",") |
121 output_igh = args.output_igh | 114 output_file = args.output |
122 output_igk = args.output_igk | |
123 output_igl = args.output_igl | |
124 output_tr = args.output_tr | |
125 output_tra = args.output_tra | |
126 output_trb = args.output_trb | |
127 output_trd = args.output_trd | |
128 output_trg = args.output_trg | |
129 | |
130 loci = { | |
131 "IG": output_ig, | |
132 "IGH": output_igh, | |
133 "IGK": output_igk, | |
134 "IGL": output_igl, | |
135 "TR": output_tr, | |
136 "TRA": output_tra, | |
137 "TRB": output_trb, | |
138 "TRD": output_trd, | |
139 "TRG": output_trg | |
140 } | |
141 | |
142 loci_to_filter = {} | |
143 | 115 |
144 logging.debug("All Parameters:") | 116 logging.debug("All Parameters:") |
145 logging.debug("Input: {0}".format(input_file)) | 117 logging.debug("Input: {0}".format(input_file)) |
146 for locus, path in loci.items(): | 118 logging.debug("Loci: {0}".format(loci)) |
147 logging.debug("{0}: {1}".format(locus, path)) | 119 logging.debug("Output: {0}".format(output_file)) |
148 if path != "None" and os.path.isdir(os.path.split(path)[0]): | |
149 loci_to_filter[locus] = path | |
150 | 120 |
151 if len(loci_to_filter) == 0: | 121 if len(loci) == 0: |
152 raise Exception("No locus selected, nothing to do") | 122 raise Exception("No locus selected, nothing to do") |
153 | |
154 logging.info("Parameters:") | |
155 for locus, path in loci_to_filter.items(): | |
156 logging.info("{0}: {1}".format(locus, path)) | |
157 | 123 |
158 work_dir = tempfile.mkdtemp() | 124 work_dir = tempfile.mkdtemp() |
159 original_files_dir = os.path.join(work_dir, "original") | 125 original_files_dir = os.path.join(work_dir, "original") |
160 os.mkdir(original_files_dir) | 126 os.mkdir(original_files_dir) |
161 | 127 |
162 unpack_imgt_zip(input_file, original_files_dir) | 128 unpack_imgt_zip(input_file, original_files_dir) |
163 | 129 |
164 for locus, path in loci_to_filter.items(): | 130 total, remain = filter_imgt_dir(original_files_dir, loci) |
165 locus_dir = os.path.join(work_dir, locus) | 131 logging.info("{0}\t{1}".format(total, remain)) |
166 shutil.copytree(original_files_dir, locus_dir) | |
167 total, remain = filter_imgt_dir(locus_dir, locus) | |
168 logging.info("{0}\t{1}\t{2}\t{3}".format(locus, path, total, remain)) | |
169 | 132 |
170 make_new_xz_file(locus_dir, loci_to_filter[locus]) | 133 make_new_xz_file(original_files_dir, output_file) |
171 | 134 |
172 | 135 |
173 if __name__ == "__main__": | 136 if __name__ == "__main__": |
174 main() | 137 main() |