Mercurial > repos > davidvanzessen > demultiplex_emc
comparison demultiplex.py @ 0:36c79869620b draft
Uploaded
author | davidvanzessen |
---|---|
date | Fri, 09 Nov 2018 05:37:11 -0500 |
parents | |
children | b6d63b9efb8f |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:36c79869620b |
---|---|
1 import argparse | |
2 import csv | |
3 import logging | |
4 import os | |
5 import sys | |
6 from collections import defaultdict | |
7 from collections import namedtuple | |
8 | |
9 from Bio import SeqIO | |
10 from Bio.Alphabet import generic_dna | |
11 from Bio.Seq import Seq | |
12 from Bio.SeqRecord import SeqRecord | |
13 | |
14 | |
15 def sniff_format(file_path): | |
16 """ | |
17 Try to guess the file format (fastq/fasta) by looking at the first character of the first line. | |
18 Should be '@' for fastq and '>' for fasta. | |
19 """ | |
20 with open(file_path, 'rU') as file_handle: | |
21 for line in file_handle: | |
22 if line.startswith("@"): | |
23 return "fastq" | |
24 if line.startswith(">"): | |
25 return "fasta" | |
26 break | |
27 return None | |
28 | |
29 | |
30 def search_barcode_in_first_half(sequence, barcode): | |
31 if type(sequence) is Seq: | |
32 sequence = str(sequence) | |
33 elif type(sequence) is SeqRecord: | |
34 sequence = str(sequence.seq) | |
35 return sequence.find(barcode, 0, int(len(sequence) / 2)) | |
36 | |
37 | |
38 def search_barcode_in_second_half(sequence, barcode): | |
39 if type(sequence) is Seq: | |
40 sequence = str(sequence) | |
41 elif type(sequence) is SeqRecord: | |
42 sequence = str(sequence.seq) | |
43 return sequence.find(barcode, int(len(sequence) / 2)) | |
44 | |
45 | |
46 def search_barcodes_in_sequence(barcode_datas, sequence): | |
47 for barcode_data in barcode_datas: | |
48 barcode_search_position = search_barcode_in_first_half(sequence, barcode_data.barcode) | |
49 if barcode_search_position != -1: | |
50 return barcode_search_position, barcode_data, False | |
51 barcode_search_position = search_barcode_in_second_half(sequence, barcode_data.barcode_reverse) | |
52 if barcode_search_position != -1: | |
53 return barcode_search_position, barcode_data, True | |
54 | |
55 barcode_search_position = search_barcode_in_first_half(sequence, barcode_data.barcode_reverse) | |
56 if barcode_search_position != -1: | |
57 return barcode_search_position, barcode_data, True | |
58 | |
59 barcode_search_position = search_barcode_in_second_half(sequence, barcode_data.barcode) | |
60 if barcode_search_position != -1: | |
61 return barcode_search_position, barcode_data, False | |
62 | |
63 return -1, None, None | |
64 | |
65 | |
66 def main(): | |
67 parser = argparse.ArgumentParser() | |
68 parser.add_argument("-i", "--input", help="The input file") | |
69 parser.add_argument("-f", "--format", help="The format of the input file (fastq/fasta)", default="auto", choices=["fasta", "fastq", "auto"]) | |
70 parser.add_argument("-o", "--output-dir", help="The output dir") | |
71 parser.add_argument("-m", "--mapping-file", help="A tab seperated file containing two columns, ID and barcode (no header)") | |
72 | |
73 args = parser.parse_args() | |
74 | |
75 input_file_path = args.input | |
76 basename_input_file_path = os.path.basename(input_file_path) | |
77 input_basename_no_ext, input_extension = os.path.splitext(basename_input_file_path) | |
78 | |
79 input_format = args.format | |
80 | |
81 logging.basicConfig(stream=sys.stdout, level=logging.INFO) | |
82 | |
83 if input_format == "auto": | |
84 if input_extension in [".fasta", ".fa"]: | |
85 input_format = "fasta" | |
86 elif input_extension in [".fastq", ".fq"]: | |
87 input_format = "fastq" | |
88 else: | |
89 logging.info("Can't auto detect input format based on extension: {0}".format(input_extension)) | |
90 logging.info("Sniffing format...") | |
91 input_format = sniff_format(input_file_path) | |
92 if not input_format: | |
93 logging.error("Can't auto detect input format") | |
94 sys.exit(1) | |
95 logging.info("Sniffed '{0}' as format.".format(input_format)) | |
96 | |
97 output_dir = args.output_dir | |
98 if not os.path.exists(output_dir): | |
99 os.makedirs(output_dir) | |
100 | |
101 with open(args.mapping_file, newline="") as handle: | |
102 ID_barcode_mapping = list(csv.DictReader(handle, fieldnames=['ID', 'barcode'], delimiter="\t")) | |
103 | |
104 logging.info("Input: {0}".format(input_file_path)) | |
105 logging.info("Format: {0}".format(input_format)) | |
106 logging.info("Output: {0}".format(output_dir)) | |
107 logging.info("Mapping: {0} ({1} mappings)".format(args.mapping_file, len(ID_barcode_mapping))) | |
108 | |
109 BarcodeData = namedtuple("BarcodeData", [ | |
110 "ID", | |
111 "barcode", | |
112 "barcode_reverse", | |
113 "output_file_path", | |
114 "output_file_handle" | |
115 ]) | |
116 | |
117 barcode_data_dict = defaultdict(list) | |
118 ID_file_handle_dict = {} | |
119 | |
120 for ID_barcode in ID_barcode_mapping: | |
121 ID = ID_barcode["ID"] | |
122 barcode = ID_barcode["barcode"] | |
123 | |
124 output_file_path = os.path.join( | |
125 output_dir, | |
126 "{0}_{1}.{2}".format(input_basename_no_ext, ID, input_format) | |
127 ) | |
128 | |
129 if ID not in ID_file_handle_dict: | |
130 ID_file_handle = open(output_file_path, 'w') | |
131 ID_file_handle_dict[ID] = ID_file_handle | |
132 | |
133 ID_file_handle = ID_file_handle_dict[ID] | |
134 | |
135 barcode_data_dict[ID] += [BarcodeData( | |
136 ID=ID, | |
137 barcode=barcode, | |
138 barcode_reverse=str(Seq(barcode, generic_dna).reverse_complement()), | |
139 output_file_path=output_file_path, | |
140 output_file_handle=ID_file_handle | |
141 )] | |
142 | |
143 discarded_output_file_path = os.path.join( | |
144 output_dir, | |
145 "{0}_{1}.{2}".format(basename_input_file_path, "discarded", input_format) | |
146 ) | |
147 | |
148 total_sequences = 0 | |
149 sequences_assigned_by_id = defaultdict(int) | |
150 | |
151 with open(input_file_path, 'rU') as input_file_handle, open(discarded_output_file_path, 'w') as discarded_output_handle: | |
152 for record in SeqIO.parse(input_file_handle, input_format): | |
153 total_sequences += 1 | |
154 for ID, barcode_datas in barcode_data_dict.items(): | |
155 barcode_position, barcode_data, reverse = search_barcodes_in_sequence(barcode_datas, record) | |
156 if barcode_position == -1: | |
157 continue | |
158 barcode = barcode_data.barcode if not reverse else barcode_data.barcode_reverse | |
159 sequences_assigned_by_id[barcode_data.ID] += 1 | |
160 logging.debug(str(record.seq)) | |
161 logging.debug(" " * barcode_position + barcode) | |
162 SeqIO.write(record, ID_file_handle_dict[ID], input_format) | |
163 break | |
164 else: # no match # TODO fuzzy match ? | |
165 SeqIO.write(record, discarded_output_handle, input_format) | |
166 if total_sequences % 10000 == 0: | |
167 assigned_sequences = sum(sequences_assigned_by_id.values()) | |
168 logging.info("Processed {0} sequences, assigned {1} ({2}%)".format( | |
169 total_sequences, | |
170 assigned_sequences, | |
171 round(assigned_sequences / total_sequences * 100) | |
172 )) | |
173 | |
174 for file_handle in ID_file_handle_dict.values(): | |
175 file_handle.close() | |
176 | |
177 assigned_sequences = sum(sequences_assigned_by_id.values()) | |
178 logging.info("Processed {0} sequences, assigned {1} ({2}%)".format( | |
179 total_sequences, | |
180 assigned_sequences, | |
181 round(assigned_sequences / total_sequences * 100) | |
182 )) | |
183 | |
184 | |
185 if __name__ == "__main__": | |
186 main() | |
187 |