6
|
1 import gzip
|
|
2 from mimetypes import guess_type
|
|
3 from functools import partial
|
|
4 from sys import argv, exit
|
|
5 import itertools
|
|
6 from itertools import zip_longest
|
8
|
7 import subprocess
|
|
8 from subprocess import call
|
6
|
9
|
|
def grouper(iterable, n, fillvalue=None):
    """Group the items of *iterable* into consecutive tuples of length *n*.

    Returns a lazy iterator of tuples.  When the input runs out in the
    middle of a tuple, the missing positions are filled with *fillvalue*.
    """
    # The same iterator object is handed to zip_longest n times, so every
    # output tuple is built from n successive items of one shared stream.
    shared = iter(iterable)
    return zip_longest(*(shared for _ in range(n)), fillvalue=fillvalue)
|
|
13
|
|
14
|
8
|
def is_gz_file(filepath):
    """Return True when the file at *filepath* begins with the bytes 0x1f 0x8b.

    Only the first two bytes are read; a file shorter than that yields False.
    """
    gzip_magic = b'\x1f\x8b'
    with open(filepath, 'rb') as handle:
        leading = handle.read(len(gzip_magic))
    return leading == gzip_magic
|
|
18
|
6
|
19
|
8
|
def _split_eol(line):
    """Split *line* into (content, line_ending); the ending may be empty."""
    content = line.rstrip("\r\n")
    return content, line[len(content):]


def lines_parse(f, output_path):
    """Move the UMI of every four-line read record in *f* into its header.

    *f* is any iterable of text lines.  It is consumed four lines at a time
    (header, sequence, separator, quality); an incomplete final record is
    padded with "\\n" lines.  For each record whose header starts with '@':

    * the UMI is the first 2 plus the last 5 bases of the sequence
      (7 nt in total);
    * the UMI is appended to the read identifier (the header text before
      the first space) as ``<id>_<UMI>``; everything after that space is
      kept unchanged;
    * the 2 leading and 5 trailing positions are removed from both the
      sequence line and the quality line.

    Records whose header does not start with '@' are skipped silently.
    The result is written to *output_path*, which is overwritten.
    """
    # The context manager guarantees the output is flushed and closed even
    # when a malformed record raises part-way through the input.
    with open(output_path, "w") as output:
        for header, seq, sep, qual in grouper(f, 4, "\n"):
            if not header.startswith("@"):
                continue

            # Separate content from line endings first, so the fixed-width
            # slices below never count "\n" / "\r\n" (or a missing final
            # newline) as one of the bases or quality characters.
            header_text, header_eol = _split_eol(header)
            bases, seq_eol = _split_eol(seq)
            quals, qual_eol = _split_eol(qual)

            umi = bases[:2] + bases[-5:]

            # partition() keeps the whole description (it may itself contain
            # spaces) and tolerates headers that have no description at all.
            read_id, gap, description = header_text.partition(" ")

            output.write(read_id + "_" + umi + gap + description + (header_eol or "\n"))
            output.write(bases[2:-5] + (seq_eol or "\n"))
            output.write(sep)
            # Keep the quality line's own ending, so a final line without a
            # newline stays without one.
            output.write(quals[2:-5] + qual_eol)
|
|
43
|
8
|
44
|
|
def UMI_processing(pathToFastaFile, output_path):
    """Run the UMI extraction on a plain-text or gzip-compressed read file.

    The input at *pathToFastaFile* is opened as gzip text (UTF-8) when
    ``is_gz_file`` reports it as compressed, otherwise as a regular text
    file, and its lines are handed to ``lines_parse`` together with
    *output_path*.

    The lines are streamed from the open file instead of being loaded into
    a list first.  The only exception is when *output_path* refers to the
    input file itself: then the input is read completely and closed before
    the output is opened, because opening it for writing truncates it.
    """
    import os  # function-scope import keeps this block self-contained

    compressed = is_gz_file(pathToFastaFile)
    in_place = os.path.exists(output_path) and os.path.samefile(
        pathToFastaFile, output_path
    )

    if compressed:
        source = gzip.open(pathToFastaFile, "rt", encoding="utf-8")
    else:
        source = open(pathToFastaFile, "r")

    with source as reads:
        if not in_place:
            lines_parse(reads, output_path)
            return
        buffered = reads.readlines()
    # Input handle is closed here; it is now safe to overwrite the file.
    lines_parse(buffered, output_path)
|
|
56
|
6
|
def main():
    """Command-line entry point.

    Expects exactly two command-line arguments (input path, output name).
    With any other argument count the process exits with a usage message;
    otherwise both arguments are forwarded to ``UMI_processing``.
    """
    expected_argc = 3  # script name + two user-supplied arguments
    if len(argv) != expected_argc:
        exit("Usage: 2 arguments required\n1: Path to fasta file \n2: name of output file")

    # Unpack the two paths; the script name itself is not needed.
    _, source_path, destination = argv
    UMI_processing(source_path, destination)
|
6
|
65
|
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|