annotate demultiplex.py @ 2:500c2eee063d draft

Uploaded
author davidvanzessen
date Fri, 09 Nov 2018 05:49:26 -0500
parents 36c79869620b
children b6d63b9efb8f
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
1 import argparse
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
2 import csv
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
3 import logging
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
4 import os
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
5 import sys
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
6 from collections import defaultdict
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
7 from collections import namedtuple
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
8
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
9 from Bio import SeqIO
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
10 from Bio.Alphabet import generic_dna
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
11 from Bio.Seq import Seq
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
12 from Bio.SeqRecord import SeqRecord
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
13
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
14
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
15 def sniff_format(file_path):
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
16 """
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
17 Try to guess the file format (fastq/fasta) by looking at the first character of the first line.
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
18 Should be '@' for fastq and '>' for fasta.
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
19 """
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
20 with open(file_path, 'rU') as file_handle:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
21 for line in file_handle:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
22 if line.startswith("@"):
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
23 return "fastq"
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
24 if line.startswith(">"):
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
25 return "fasta"
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
26 break
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
27 return None
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
28
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
29
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
30 def search_barcode_in_first_half(sequence, barcode):
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
31 if type(sequence) is Seq:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
32 sequence = str(sequence)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
33 elif type(sequence) is SeqRecord:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
34 sequence = str(sequence.seq)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
35 return sequence.find(barcode, 0, int(len(sequence) / 2))
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
36
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
37
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
38 def search_barcode_in_second_half(sequence, barcode):
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
39 if type(sequence) is Seq:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
40 sequence = str(sequence)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
41 elif type(sequence) is SeqRecord:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
42 sequence = str(sequence.seq)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
43 return sequence.find(barcode, int(len(sequence) / 2))
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
44
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
45
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
46 def search_barcodes_in_sequence(barcode_datas, sequence):
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
47 for barcode_data in barcode_datas:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
48 barcode_search_position = search_barcode_in_first_half(sequence, barcode_data.barcode)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
49 if barcode_search_position != -1:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
50 return barcode_search_position, barcode_data, False
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
51 barcode_search_position = search_barcode_in_second_half(sequence, barcode_data.barcode_reverse)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
52 if barcode_search_position != -1:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
53 return barcode_search_position, barcode_data, True
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
54
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
55 barcode_search_position = search_barcode_in_first_half(sequence, barcode_data.barcode_reverse)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
56 if barcode_search_position != -1:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
57 return barcode_search_position, barcode_data, True
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
58
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
59 barcode_search_position = search_barcode_in_second_half(sequence, barcode_data.barcode)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
60 if barcode_search_position != -1:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
61 return barcode_search_position, barcode_data, False
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
62
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
63 return -1, None, None
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
64
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
65
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
66 def main():
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
67 parser = argparse.ArgumentParser()
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
68 parser.add_argument("-i", "--input", help="The input file")
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
69 parser.add_argument("-f", "--format", help="The format of the input file (fastq/fasta)", default="auto", choices=["fasta", "fastq", "auto"])
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
70 parser.add_argument("-o", "--output-dir", help="The output dir")
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
71 parser.add_argument("-m", "--mapping-file", help="A tab seperated file containing two columns, ID and barcode (no header)")
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
72
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
73 args = parser.parse_args()
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
74
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
75 input_file_path = args.input
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
76 basename_input_file_path = os.path.basename(input_file_path)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
77 input_basename_no_ext, input_extension = os.path.splitext(basename_input_file_path)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
78
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
79 input_format = args.format
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
80
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
81 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
82
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
83 if input_format == "auto":
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
84 if input_extension in [".fasta", ".fa"]:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
85 input_format = "fasta"
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
86 elif input_extension in [".fastq", ".fq"]:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
87 input_format = "fastq"
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
88 else:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
89 logging.info("Can't auto detect input format based on extension: {0}".format(input_extension))
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
90 logging.info("Sniffing format...")
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
91 input_format = sniff_format(input_file_path)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
92 if not input_format:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
93 logging.error("Can't auto detect input format")
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
94 sys.exit(1)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
95 logging.info("Sniffed '{0}' as format.".format(input_format))
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
96
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
97 output_dir = args.output_dir
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
98 if not os.path.exists(output_dir):
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
99 os.makedirs(output_dir)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
100
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
101 with open(args.mapping_file, newline="") as handle:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
102 ID_barcode_mapping = list(csv.DictReader(handle, fieldnames=['ID', 'barcode'], delimiter="\t"))
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
103
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
104 logging.info("Input: {0}".format(input_file_path))
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
105 logging.info("Format: {0}".format(input_format))
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
106 logging.info("Output: {0}".format(output_dir))
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
107 logging.info("Mapping: {0} ({1} mappings)".format(args.mapping_file, len(ID_barcode_mapping)))
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
108
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
109 BarcodeData = namedtuple("BarcodeData", [
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
110 "ID",
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
111 "barcode",
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
112 "barcode_reverse",
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
113 "output_file_path",
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
114 "output_file_handle"
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
115 ])
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
116
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
117 barcode_data_dict = defaultdict(list)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
118 ID_file_handle_dict = {}
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
119
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
120 for ID_barcode in ID_barcode_mapping:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
121 ID = ID_barcode["ID"]
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
122 barcode = ID_barcode["barcode"]
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
123
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
124 output_file_path = os.path.join(
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
125 output_dir,
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
126 "{0}_{1}.{2}".format(input_basename_no_ext, ID, input_format)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
127 )
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
128
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
129 if ID not in ID_file_handle_dict:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
130 ID_file_handle = open(output_file_path, 'w')
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
131 ID_file_handle_dict[ID] = ID_file_handle
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
132
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
133 ID_file_handle = ID_file_handle_dict[ID]
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
134
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
135 barcode_data_dict[ID] += [BarcodeData(
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
136 ID=ID,
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
137 barcode=barcode,
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
138 barcode_reverse=str(Seq(barcode, generic_dna).reverse_complement()),
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
139 output_file_path=output_file_path,
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
140 output_file_handle=ID_file_handle
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
141 )]
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
142
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
143 discarded_output_file_path = os.path.join(
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
144 output_dir,
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
145 "{0}_{1}.{2}".format(basename_input_file_path, "discarded", input_format)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
146 )
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
147
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
148 total_sequences = 0
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
149 sequences_assigned_by_id = defaultdict(int)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
150
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
151 with open(input_file_path, 'rU') as input_file_handle, open(discarded_output_file_path, 'w') as discarded_output_handle:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
152 for record in SeqIO.parse(input_file_handle, input_format):
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
153 total_sequences += 1
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
154 for ID, barcode_datas in barcode_data_dict.items():
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
155 barcode_position, barcode_data, reverse = search_barcodes_in_sequence(barcode_datas, record)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
156 if barcode_position == -1:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
157 continue
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
158 barcode = barcode_data.barcode if not reverse else barcode_data.barcode_reverse
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
159 sequences_assigned_by_id[barcode_data.ID] += 1
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
160 logging.debug(str(record.seq))
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
161 logging.debug(" " * barcode_position + barcode)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
162 SeqIO.write(record, ID_file_handle_dict[ID], input_format)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
163 break
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
164 else: # no match # TODO fuzzy match ?
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
165 SeqIO.write(record, discarded_output_handle, input_format)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
166 if total_sequences % 10000 == 0:
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
167 assigned_sequences = sum(sequences_assigned_by_id.values())
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
168 logging.info("Processed {0} sequences, assigned {1} ({2}%)".format(
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
169 total_sequences,
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
170 assigned_sequences,
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
171 round(assigned_sequences / total_sequences * 100)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
172 ))
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
173
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
174 for file_handle in ID_file_handle_dict.values():
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
175 file_handle.close()
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
176
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
177 assigned_sequences = sum(sequences_assigned_by_id.values())
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
178 logging.info("Processed {0} sequences, assigned {1} ({2}%)".format(
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
179 total_sequences,
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
180 assigned_sequences,
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
181 round(assigned_sequences / total_sequences * 100)
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
182 ))
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
183
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
184
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
185 if __name__ == "__main__":
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
186 main()
36c79869620b Uploaded
davidvanzessen
parents:
diff changeset
187