annotate UMI_riboseq_processing/UMI_test.py @ 5:e370df93715d draft

Uploaded
author triasteran
date Tue, 21 Jun 2022 09:22:37 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
e370df93715d Uploaded
triasteran
parents:
diff changeset
1 import gzip
e370df93715d Uploaded
triasteran
parents:
diff changeset
2 from mimetypes import guess_type
e370df93715d Uploaded
triasteran
parents:
diff changeset
3 from functools import partial
e370df93715d Uploaded
triasteran
parents:
diff changeset
4 from sys import argv, exit
e370df93715d Uploaded
triasteran
parents:
diff changeset
5 import itertools
e370df93715d Uploaded
triasteran
parents:
diff changeset
6 from itertools import zip_longest
e370df93715d Uploaded
triasteran
parents:
diff changeset
7
e370df93715d Uploaded
triasteran
parents:
diff changeset
8
e370df93715d Uploaded
triasteran
parents:
diff changeset
def grouper(iterable, n, fillvalue=None):
    """Collect consecutive items of *iterable* into tuples of length *n*.

    The final tuple is padded with *fillvalue* when the input runs out
    before the tuple is complete.
    """
    # Every slot refers to the very same iterator, so zip_longest consumes
    # n successive items from it to build each output tuple.
    shared = iter(iterable)
    slots = (shared,) * n
    return zip_longest(*slots, fillvalue=fillvalue)
e370df93715d Uploaded
triasteran
parents:
diff changeset
12
e370df93715d Uploaded
triasteran
parents:
diff changeset
13
e370df93715d Uploaded
triasteran
parents:
diff changeset
# Number of lines read per record: header, sequence, separator, quality.
chunk_size = 4
e370df93715d Uploaded
triasteran
parents:
diff changeset
15
e370df93715d Uploaded
triasteran
parents:
diff changeset
def copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output):
    """Move the UMI of every FASTQ read into its header and write trimmed reads.

    For each four-line record, the first 2 and the last 5 characters of the
    sequence line are joined into a 7-character UMI. The UMI is appended,
    after an underscore, to the first space-delimited field of the header;
    the rest of the header is kept unchanged. The same 2 + 5 positions are
    removed from both the sequence line and the quality line.

    Args:
        pathToFastaFile: Path to the input FASTQ file. It is read through
            gzip when ``mimetypes.guess_type`` reports a gzip encoding for
            the file name (extension based), otherwise as plain text.
        output: Path of the output file; it is always written gzip-compressed.

    Raises:
        ValueError: If the input ends in the middle of a four-line record.
    """
    # Find whether it is plain or gzipped FASTQ (decided by the file extension).
    encoding = guess_type(pathToFastaFile)[1]
    _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open

    # Both handles share one `with`, so the output is closed even when
    # parsing raises, and no output file is created if the input cannot
    # be opened.
    with _open(pathToFastaFile) as f, gzip.open(output, "wt") as out:
        for lines in grouper(f, chunk_size, ""):
            # Lines read from a file are never "", so "" can only be the
            # padding grouper adds after the input is exhausted.
            if not any(line.strip() for line in lines):
                continue  # trailing blank lines, nothing to convert
            if "" in lines:
                raise ValueError(
                    "Truncated FASTQ record starting at: " + lines[0].rstrip()
                )

            # Record layout: id, seq, '+', quality.
            header, seq, sep, qual = lines[:4]

            seq = seq.rstrip()
            # 7 nt in total: 5' NN plus the last 3' NNNNN.
            UMI = seq[:2] + seq[-5:]
            # What remains is, per the original author's note,
            # footprint + barcode.
            trimmed_seq = seq[2:-5] + "\n"

            # Only the final line of a file can lack a newline; keep its
            # terminator as found and always trim exactly 2 + 5 characters
            # so quality and sequence stay the same length.
            qual_end = "\n" if qual.endswith("\n") else ""
            new_qual = qual.rstrip()[2:-5] + qual_end

            # partition keeps everything after the first space, so headers
            # with no description or with several fields survive intact.
            read_id, space, description = header.rstrip("\n").partition(" ")
            new_header = read_id + "_" + UMI + space + description + "\n"

            out.writelines((new_header, trimmed_seq, sep, new_qual))
e370df93715d Uploaded
triasteran
parents:
diff changeset
44
e370df93715d Uploaded
triasteran
parents:
diff changeset
45
e370df93715d Uploaded
triasteran
parents:
diff changeset
def main():
    """Run the UMI extraction on the two paths given on the command line.

    Terminates through ``sys.exit`` with a usage message unless exactly two
    arguments (input path, output path) follow the script name.
    """
    # argv[0] is the script name, so two user arguments mean three entries.
    if len(argv) == 3:
        _script, pathToFastaFile, output = argv
        copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output)
    else:
        exit("Usage: 2 arguments required\n1: Path to fasta file \n2: name of output file")


# Only act as a command-line tool when executed directly, not when imported.
if __name__ == "__main__":
    main()