annotate UMI_riboseq_processing/UMI.py @ 8:701804f5ad4b draft

Uploaded
author triasteran
date Tue, 21 Jun 2022 13:22:08 +0000
parents be394fb47250
children 31438c26afec
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
1 import gzip
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
2 from mimetypes import guess_type
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
3 from functools import partial
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
4 from sys import argv, exit
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
5 import itertools
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
6 from itertools import zip_longest
8
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
7 import subprocess
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
8 from subprocess import call
6
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
9
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
10 def grouper(iterable, n, fillvalue=None):
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
11 args = [iter(iterable)] * n
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
12 return zip_longest(*args, fillvalue=fillvalue)
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
13
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
14
8
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
15 def is_gz_file(filepath):
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
16 with open(filepath, 'rb') as test_f:
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
17 return test_f.read(2) == b'\x1f\x8b'
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
18
6
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
19
8
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
20 def lines_parse(f, output_path):
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
21 output = open(output_path,"w")
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
22 for lines in grouper(f, 4, "\n"):
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
23 header = lines[0]
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
24 #print (header)
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
25 seq = lines[1]
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
26 sep = lines[2]
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
27 qual = lines[3]
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
28 # check if header is OK
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
29 if (header.startswith('@')):
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
30 trimmed_seq = seq[2:-6]+"\n" # fooprint + barcode
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
31 UMI = seq[0:2]+seq.rstrip()[-5:] #7nt in total; 5'NN and last 3'NNNNN
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
32 split_header = header.split(" ")
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
33 new_header = split_header[0]+"_"+UMI+" "+split_header[1]
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
34 if qual[-1:] == "\n":
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
35 new_qual = qual[2:-6]+"\n"
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
36 else:
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
37 new_qual = qual[2:-6]
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
38 output.write(new_header)
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
39 output.write(trimmed_seq)
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
40 output.write(sep)
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
41 output.write(new_qual)
6
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
42 output.close()
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
43
8
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
44
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
45 def UMI_processing(pathToFastaFile, output_path):
6
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
46
8
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
47 if is_gz_file(pathToFastaFile) == True:
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
48 with gzip.open(pathToFastaFile, 'rb') as file:
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
49 f = [x.decode("utf-8") for x in file.readlines()]
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
50
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
51 else:
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
52 with open(pathToFastaFile, 'r') as file:
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
53 f = file.readlines()
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
54
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
55 lines_parse(f, output_path)
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
56
6
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
57 def main():
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
58 if len(argv) != 3:
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
59 exit("Usage: 2 arguments required\n1: Path to fasta file \n2: name of output file")
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
60
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
61 # Get paths
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
62 pathToFastaFile = argv[1]
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
63 output = argv[2]
8
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
64 UMI_processing(pathToFastaFile, output)
6
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
65
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
66 if __name__ == "__main__":
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
67 main()