comparison imgt_locus_split.py @ 2:4bb8f6523130 draft

Uploaded
author davidvanzessen
date Tue, 05 Mar 2019 04:41:47 -0500
parents 418b7dbc8947
children
comparison
equal deleted inserted replaced
1:418b7dbc8947 2:4bb8f6523130
12 12
13 imgt_file_regex = re.compile("^\d+_[^P]") 13 imgt_file_regex = re.compile("^\d+_[^P]")
14 14
15 15
16 def sniff_imgt_type(input_file): 16 def sniff_imgt_type(input_file):
17 m = magic.Magic() 17 file_type = magic.from_file(input_file)
18 file_type = m.from_file(input_file)
19 logging.debug("File type of {0} is {1}".format(input_file, file_type)) 18 logging.debug("File type of {0} is {1}".format(input_file, file_type))
20 return file_type.split(" ")[0] 19 return file_type
21 20
22 21
23 def unpack_imgt_zip(input_file, output_dir): 22 def unpack_imgt_zip(input_file, output_dir):
24 imgt_type = sniff_imgt_type(input_file) 23 imgt_type = sniff_imgt_type(input_file)
25 if imgt_type == "Zip": 24 if imgt_type.startswith("Zip"):
26 with zipfile.ZipFile(input_file) as inf: 25 with zipfile.ZipFile(input_file) as inf:
27 inf.extractall(output_dir) 26 inf.extractall(output_dir)
28 elif imgt_type == "XZ": 27 elif imgt_type.startswith("XZ"):
29 with tarfile.open(input_file) as inf: 28 with tarfile.open(input_file) as inf:
30 inf.extractall(output_dir) 29 inf.extractall(output_dir)
31 else: 30 else:
32 raise IOError("Unsuppported file type: {0}".format(imgt_type)) 31 raise IOError("Unsuppported file type: {0}".format(imgt_type))
33 logging.debug("Extracted {0} to {1}".format(input_file, output_dir)) 32 logging.debug("Extracted {0} to {1}".format(input_file, output_dir))
42 file = os.path.join(check, file) 41 file = os.path.join(check, file)
43 shutil.move(file, new_file) 42 shutil.move(file, new_file)
44 shutil.rmtree(check) 43 shutil.rmtree(check)
45 44
46 45
47 def filter_imgt_file(old_file, new_file, column, fltr): 46 def filter_tabular_file(old_file, new_file, column, regex):
48 logging.debug("Filtering {0} with {1}".format(old_file, fltr)) 47 logging.debug("Filtering {0} with {1}".format(old_file, regex.pattern))
49 first = True 48 first = True
50 total = 0 49 total = 0
51 remain = 0 50 remain = 0
52 with open(old_file, 'r') as of, open(new_file, 'w') as nf: 51 with open(old_file, 'r') as of, open(new_file, 'w') as nf:
53 column_index = -1 52 column_index = -1
57 column_index = splt.index(column) 56 column_index = splt.index(column)
58 first = False 57 first = False
59 nf.write(line) 58 nf.write(line)
60 continue 59 continue
61 total += 1 60 total += 1
62 if len(splt) > column_index and splt[column_index].find(fltr) != -1: 61 if len(splt) >= column_index and regex.search(splt[column_index]):
63 remain += 1 62 remain += 1
64 nf.write(line) 63 nf.write(line)
65 return total, remain 64 return total, remain
66 65
67 66
68 def all_same_in_list(l): 67 def all_same_in_list(l):
69 return all(l[0] == x for x in l[1:]) 68 return all(l[0] == x for x in l[1:])
70 69
71 70
72 def filter_imgt_dir(imgt_dir, locus): 71 def filter_imgt_dir(imgt_dir, loci):
73 logging.info("Working on {0}".format(locus)) 72 logging.info("Filtering {0} with {1}".format(imgt_dir, loci))
74 imgt_files = [f for f in os.listdir(imgt_dir) if imgt_file_regex.match(f)] 73 imgt_files = [f for f in os.listdir(imgt_dir) if imgt_file_regex.match(f)]
75 tmp_file = os.path.join(imgt_dir, "tmp.txt") 74 tmp_file = os.path.join(imgt_dir, "tmp.txt")
76 totals = [] 75 totals = []
77 remains = [] 76 remains = []
77 loci_regex = re.compile("|".join(loci))
78 for imgt_file in imgt_files: 78 for imgt_file in imgt_files:
79 imgt_file = os.path.join(imgt_dir, imgt_file) 79 imgt_file = os.path.join(imgt_dir, imgt_file)
80 total, remain = filter_imgt_file(imgt_file, tmp_file, "V-GENE and allele", locus) 80 total, remain = filter_tabular_file(imgt_file, tmp_file, "V-GENE and allele", loci_regex)
81 totals.append(total) 81 totals.append(total)
82 remains.append(remain) 82 remains.append(remain)
83 logging.debug("{0} rows, {1} after filtering".format(total, remain)) 83 logging.debug("{0} rows, {1} after filtering".format(total, remain))
84 shutil.move(tmp_file, imgt_file) 84 shutil.move(tmp_file, imgt_file)
85 if not (all_same_in_list(totals) and all_same_in_list(remains)): 85 if not (all_same_in_list(totals) and all_same_in_list(remains)):
86 logging.warning("Not all files had the same number of sequences remaining for {0}".format(imgt_dir)) 86 logging.warning("Not all files had the same number of sequences remaining for {0}: {1}".format(imgt_dir, remains))
87 return totals[0], remains[0] 87 return totals[0], remains[0]
88 88
89 89
90 def make_new_xz_file(input_dir, output_file): 90 def make_new_xz_file(input_dir, output_file):
91 logging.info("Creating new IMGT zip for {0} at {1}".format(input_dir, output_file)) 91 logging.info("Creating new IMGT zip for {0} at {1}".format(input_dir, output_file))
97 out.add(imgt_file, arcname=os.path.basename(imgt_file)) 97 out.add(imgt_file, arcname=os.path.basename(imgt_file))
98 98
99 99
100 def main(): 100 def main():
101 parser = argparse.ArgumentParser() 101 parser = argparse.ArgumentParser()
102 parser.add_argument("--input", help="The input IMGT file", required=True) 102 parser.add_argument("-i", "--input", help="The input IMGT file", required=True)
103 parser.add_argument("--output-ig", help="The output file for new IMGT ZIP with just IG sequences", default="None") 103 parser.add_argument("-l", "--loci", help="The Loci to filter on", required=True)
104 parser.add_argument("--output-igh", help="The output file for new IMGT ZIP with just IGH sequences", default="None") 104 parser.add_argument("-o", "--output", help="The output file for the new IMGT zip with just the filtered sequences", required=True)
105 parser.add_argument("--output-igk", help="The output file for new IMGT ZIP with just IGK sequences", default="None")
106 parser.add_argument("--output-igl", help="The output file for new IMGT ZIP with just IGL sequences", default="None")
107 parser.add_argument("--output-tr", help="The output file for new IMGT ZIP with just TR sequences", default="None")
108 parser.add_argument("--output-tra", help="The output file for new IMGT ZIP with just TRA sequences", default="None")
109 parser.add_argument("--output-trb", help="The output file for new IMGT ZIP with just TRB sequences", default="None")
110 parser.add_argument("--output-trd", help="The output file for new IMGT ZIP with just TRD sequences", default="None")
111 parser.add_argument("--output-trg", help="The output file for new IMGT ZIP with just TRG sequences", default="None")
112 105
113 logging.basicConfig(filename="./log.html", level=logging.DEBUG, format="%(asctime)s:&emsp;%(message)s <br />", 106 logging.basicConfig(filename="./log.html", level=logging.DEBUG, format="%(asctime)s:&emsp;%(message)s <br />",
114 datefmt='%Y/%m/%d %H:%M:%S') 107 datefmt='%Y/%m/%d %H:%M:%S')
115 logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) 108 logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
116 logging.info("Started IMGT locus split") 109 logging.info("Started IMGT locus split")
117 110
118 args = parser.parse_args() 111 args = parser.parse_args()
119 input_file = args.input 112 input_file = args.input
120 output_ig = args.output_ig 113 loci = args.loci.split(",")
121 output_igh = args.output_igh 114 output_file = args.output
122 output_igk = args.output_igk
123 output_igl = args.output_igl
124 output_tr = args.output_tr
125 output_tra = args.output_tra
126 output_trb = args.output_trb
127 output_trd = args.output_trd
128 output_trg = args.output_trg
129
130 loci = {
131 "IG": output_ig,
132 "IGH": output_igh,
133 "IGK": output_igk,
134 "IGL": output_igl,
135 "TR": output_tr,
136 "TRA": output_tra,
137 "TRB": output_trb,
138 "TRD": output_trd,
139 "TRG": output_trg
140 }
141
142 loci_to_filter = {}
143 115
144 logging.debug("All Parameters:") 116 logging.debug("All Parameters:")
145 logging.debug("Input: {0}".format(input_file)) 117 logging.debug("Input: {0}".format(input_file))
146 for locus, path in loci.items(): 118 logging.debug("Loci: {0}".format(loci))
147 logging.debug("{0}: {1}".format(locus, path)) 119 logging.debug("Output: {0}".format(output_file))
148 if path != "None" and os.path.isdir(os.path.split(path)[0]):
149 loci_to_filter[locus] = path
150 120
151 if len(loci_to_filter) == 0: 121 if len(loci) == 0:
152 raise Exception("No locus selected, nothing to do") 122 raise Exception("No locus selected, nothing to do")
153
154 logging.info("Parameters:")
155 for locus, path in loci_to_filter.items():
156 logging.info("{0}: {1}".format(locus, path))
157 123
158 work_dir = tempfile.mkdtemp() 124 work_dir = tempfile.mkdtemp()
159 original_files_dir = os.path.join(work_dir, "original") 125 original_files_dir = os.path.join(work_dir, "original")
160 os.mkdir(original_files_dir) 126 os.mkdir(original_files_dir)
161 127
162 unpack_imgt_zip(input_file, original_files_dir) 128 unpack_imgt_zip(input_file, original_files_dir)
163 129
164 for locus, path in loci_to_filter.items(): 130 total, remain = filter_imgt_dir(original_files_dir, loci)
165 locus_dir = os.path.join(work_dir, locus) 131 logging.info("{0}\t{1}".format(total, remain))
166 shutil.copytree(original_files_dir, locus_dir)
167 total, remain = filter_imgt_dir(locus_dir, locus)
168 logging.info("{0}\t{1}\t{2}\t{3}".format(locus, path, total, remain))
169 132
170 make_new_xz_file(locus_dir, loci_to_filter[locus]) 133 make_new_xz_file(original_files_dir, output_file)
171 134
172 135
173 if __name__ == "__main__": 136 if __name__ == "__main__":
174 main() 137 main()