annotate UMI_riboseq_tool/UMI.py @ 11:0b5c3e35cddb draft

Uploaded
author jackcurragh
date Mon, 25 Jul 2022 11:34:03 +0000
parents d003917eaf2c
children cf1a0cbe5c34
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
10
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
1 import gzip
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
2 from sys import argv, exit
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
3 from itertools import zip_longest
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
4 from Bio import SeqIO
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
5
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
6 def grouper(iterable, n, fillvalue=None):
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
7 args = [iter(iterable)] * n
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
8 return zip_longest(*args, fillvalue=fillvalue)
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
9
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
10
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
11 def is_gz_file(filepath):
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
12 with open(filepath, 'rb') as test_f:
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
13 return test_f.read(2) == b'\x1f\x8b'
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
14
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
15
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
16 def process_fastq_record(record, output, UMI_5_prime_length=2, UMI_3_prime_length=5):
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
17 '''
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
18 Write UMI to FASTQ header for given biopython record to given open output
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
19 UMI_5_prime_length number of bases to trim from 5' and write to header
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
20 UMI_3_prime_length number of bases to trim from 3' and write to header
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
21 defaults are for McGlincy Ingolia Protocol
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
22 '''
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
23 lines = record.format('fastq').split('\n') # all 4 lines
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
24 header = lines[0]
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
25 seq = lines[1]
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
26 sep = lines[2]
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
27 qual = lines[3]
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
28
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
29 if (header.startswith('@')):
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
30 trimmed_seq = seq[UMI_5_prime_length:-(UMI_3_prime_length + 1)] # fooprint + barcode
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
31 UMI = seq[0:UMI_5_prime_length]+seq.rstrip()[-UMI_3_prime_length:len(seq)] #7nt in total; 5'NN and last 3'NNNNN
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
32 split_header = header.split(" ")
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
33 new_header = split_header[0]+"_"+UMI+" "+split_header[1]
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
34 new_qual = qual[UMI_5_prime_length:-(UMI_3_prime_length + 1)]
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
35 output.write(new_header+'\n')
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
36 output.write(trimmed_seq+'\n')
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
37 output.write(sep+'\n')
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
38 output.write(new_qual+'\n')
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
39
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
40
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
41
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
42 def UMI_processing(pathToFastaFile, output_path, bool_gzip, UMI_5_prime_length, UMI_3_prime_length):
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
43
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
44 if bool_gzip:
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
45 output = gzip.open(output_path,"wt")
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
46 else:
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
47 output = open(output_path, 'w')
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
48
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
49 if is_gz_file(pathToFastaFile) == True:
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
50 print ('file is gzipped fastq')
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
51 with gzip.open(pathToFastaFile, "rt") as handle:
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
52 for i, record in enumerate(SeqIO.parse(handle, "fastq")):
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
53 process_fastq_record(record, output, UMI_5_prime_length=2, UMI_3_prime_length=5)
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
54
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
55 else:
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
56 for record in SeqIO.parse(pathToFastaFile, 'fastq'):
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
57 process_fastq_record(record, output, UMI_5_prime_length, UMI_3_prime_length)
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
58
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
59 output.close()
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
60
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
61
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
62
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
63
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
64 def main():
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
65 if len(argv) != 6:
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
66 exit("Usage: 3 arguments required\n1: Path to fasta file \n2: name of output file\n3: string 'True' or 'False' whether to gzip output")
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
67
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
68 # Get paths
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
69 pathToFastaFile = argv[1]
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
70 output = argv[2]
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
71 bool_gzip = True if argv[3].lower().capitalize() == 'True' else False
11
0b5c3e35cddb Uploaded
jackcurragh
parents: 10
diff changeset
72 UMI_5_prime_length = int(argv[4])
0b5c3e35cddb Uploaded
jackcurragh
parents: 10
diff changeset
73 UMI_3_prime_length = int(argv[5])
10
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
74 UMI_processing(pathToFastaFile, output, bool_gzip, UMI_5_prime_length, UMI_3_prime_length)
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
75
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
76 if __name__ == "__main__":
d003917eaf2c Uploaded
jackcurragh
parents:
diff changeset
77 main()