annotate UMI_riboseq_processing/UMI.py @ 2:6958515efa76 draft

Uploaded
author triasteran
date Mon, 20 Jun 2022 07:27:23 +0000
parents 5d0d5933d370
children d27375bc4a1c
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ef98c6fad2a2 Uploaded
triasteran
parents:
diff changeset
1 import itertools
ef98c6fad2a2 Uploaded
triasteran
parents:
diff changeset
2 from sys import argv, exit
ef98c6fad2a2 Uploaded
triasteran
parents:
diff changeset
3 from itertools import zip_longest
ef98c6fad2a2 Uploaded
triasteran
parents:
diff changeset
4
ef98c6fad2a2 Uploaded
triasteran
parents:
diff changeset
def grouper(iterable, n, fillvalue=None):
    """Collect consecutive items of *iterable* into tuples of length *n*.

    The same iterator object is repeated *n* times, so each output tuple is
    filled by pulling *n* successive items from the input. If the input runs
    out part-way through a tuple, the remaining slots hold *fillvalue*.

    Example: ``grouper("ABCDEFG", 3, "x")`` yields ABC, DEF, Gxx.

    Args:
        iterable: Any iterable; it is consumed lazily.
        n: Number of items per output tuple.
        fillvalue: Value used to pad the final, incomplete tuple.

    Returns:
        An iterator of *n*-tuples.
    """
    # n references to ONE iterator: zip_longest advances it n times per tuple.
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)
ef98c6fad2a2 Uploaded
triasteran
parents:
diff changeset
8
ef98c6fad2a2 Uploaded
triasteran
parents:
diff changeset
9
ef98c6fad2a2 Uploaded
triasteran
parents:
diff changeset
# Number of input lines grouped into one record by trimandpaste():
# header, sequence, separator, quality.
chunk_size = 4
ef98c6fad2a2 Uploaded
triasteran
parents:
diff changeset
11
ef98c6fad2a2 Uploaded
triasteran
parents:
diff changeset
12
ef98c6fad2a2 Uploaded
triasteran
parents:
diff changeset
def trimandpaste(pathToFastaFile, output):
    """Move the UMI of every read from its sequence into its header.

    The input is consumed four lines at a time (header, sequence, separator,
    quality). For each record the first 2 and the last 5 bases of the
    sequence are joined into a 7 nt UMI, which is appended to the read name
    as ``<name>_<UMI>``. Those 7 positions are then removed from both the
    sequence and the quality string, so the two stay the same length.

    Args:
        pathToFastaFile: Path of the input file. Despite the parameter name,
            records are read as four lines each (header, sequence,
            separator, quality).
        output: Path of the file to write; it is overwritten if it exists.

    Notes:
        * Line endings are stripped before any slicing. Slicing the raw line
          would count the trailing newline as one of the "last 5" characters,
          producing a 6 nt UMI and leaving one UMI base in the read.
        * Everything after the first space of the header is kept verbatim;
          headers without a description are accepted.
        * Records that are completely empty (padding from grouper() or blank
          lines at the end of the file) are skipped.
        * Reads shorter than 7 nt are not rejected; the slices simply come
          out short or empty.
        * Every output line is terminated with a single newline.
    """
    umi_5p = 2  # UMI bases taken from the 5' end of the read
    umi_3p = 5  # UMI bases taken from the 3' end of the read

    # Both files are managed by the with-statement so the output is flushed
    # and closed even if a record raises part-way through.
    with open(pathToFastaFile) as f, open(output, "w") as out:
        for lines in grouper(f, chunk_size, ""):  # for every chunk_sized chunk
            header, seq, sep, qual = (lines[i].rstrip("\r\n") for i in range(4))

            if not (header or seq or sep or qual):
                # Nothing but padding / blank lines: no record to emit.
                continue

            umi = seq[:umi_5p] + seq[-umi_3p:]  # 7 nt in total: 5' NN + 3' NNNNN

            # partition() keeps the whole description (even if it contains
            # further spaces) and copes with headers that have none.
            name, space, description = header.partition(" ")
            new_header = name + "_" + umi + space + description

            out.write(new_header + "\n")
            out.write(seq[umi_5p:-umi_3p] + "\n")  # footprint + barcode
            out.write(sep + "\n")
            out.write(qual[umi_5p:-umi_3p] + "\n")
ef98c6fad2a2 Uploaded
triasteran
parents:
diff changeset
36
ef98c6fad2a2 Uploaded
triasteran
parents:
diff changeset
def main():
    """Command-line entry point: ``UMI.py <input file> <output file>``.

    Exits with a usage message (via ``sys.exit``) unless exactly two
    arguments follow the script name; otherwise forwards them to
    trimandpaste().
    """
    if len(argv) != 3:
        exit("Usage: 2 arguments required\n1: Path to fasta file \n2: name of output file")

    # Get paths
    pathToFastaFile = argv[1]
    output = argv[2]

    trimandpaste(pathToFastaFile, output)
ef98c6fad2a2 Uploaded
triasteran
parents:
diff changeset
46
ef98c6fad2a2 Uploaded
triasteran
parents:
diff changeset
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()