annotate get_cov_stats.py @ 1:e5dc0426ca39 draft default tip

Uploaded
author p.lucas
date Tue, 11 Jun 2024 12:28:27 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
1 # -*- coding: utf-8 -*-
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
2 import os
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
3 import re
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
4 import argparse
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
5 import sys
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
6 import subprocess
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
7
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
8 from subprocess import Popen, PIPE
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
9
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
10 def get_contigs(bam):
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
11 header, err = Popen(["samtools", "view", "-H", bam], stdout=PIPE, stderr=PIPE).communicate()
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
12 headers = header.decode("utf-8").splitlines()
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
13 contigs = {}
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
14 for h in headers:
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
15 x = re.findall(r"@SQ\WSN:([A-Za-z0-9_\.\|-]*)\WLN:([0-9]+)", h)
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
16 if x != []:
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
17 contigs[x[0][0]] = int(x[0][1])
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
18 return contigs
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
19
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
20
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
21 def coverage(bam, paired):
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
22 if os.path.isfile(bam) is False:
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
23 raise Exception("Bam file does not exist")
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
24 subprocess.run(f"samtools index {bam}", shell=True)
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
25 contigs = get_contigs(bam)
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
26 coverage_dict = {}
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
27 command = "samtools stats %s | grep \"raw total seq\"" % (bam)
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
28 line = Popen(command, stdout=PIPE, shell=True).communicate()[0].decode("utf-8")
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
29 line = line.rstrip("\n").split("\t")
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
30 nb_reads = int(line[2])
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
31
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
32 for c in contigs.keys():
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
33 c_escaped = re.escape(c)
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
34 sam_command = f"samtools depth -d 0 -r {c_escaped} {bam}"
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
35 command = f"{sam_command} | awk '{{sum+=$3;cnt++}}END{{print cnt \"\t\" sum}}'"
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
36 coverage_dict[c] = {}
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
37 com = Popen(command, stdout=PIPE, shell=True).communicate()[0].decode("utf-8")
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
38 com = com.rstrip("\n").split("\t")
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
39 command = "samtools view -c -F 0x904 %s %s " % (bam, c_escaped)
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
40 mapped_read = Popen(command, stdout=PIPE, shell=True).communicate()[0].decode("utf-8")
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
41 if not com[0]:
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
42 com[0] = 0
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
43 if not com[1]:
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
44 com[1] = 0
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
45 # print("This is c: "+c)
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
46 # if isinstance(com[0], int) == False:
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
47 # com[0] = 0
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
48 # if isinstance(com[1], int) == False:
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
49 # com[1] = 0
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
50 # print("This is com 0: "+str(com[0]))
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
51 # print("This is com 1: "+str(com[1]))
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
52 coverage_dict[c]["Bases Mapped"] = int(com[0])
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
53 coverage_dict[c]["Sum of Depths"] = int(com[1])
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
54 coverage_dict[c]["Breadth of Coverage"] = coverage_dict[c]["Bases Mapped"] / float(contigs[c])
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
55 coverage_dict[c]["Depth of Coverage"] = coverage_dict[c]["Sum of Depths"] / float(contigs[c])
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
56 coverage_dict[c]["Length"] = int(contigs[c])
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
57 coverage_dict[c]["Mapped Reads"] = int(mapped_read)
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
58 # print("cov"+str(coverage_dict[c]["Depth of Coverage"]))
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
59 if coverage_dict[c]["Depth of Coverage"] > 0:
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
60 coverage_dict[c]["N Reads covX"] = 80 * nb_reads / coverage_dict[c]["Depth of Coverage"]
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
61 if paired:
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
62 coverage_dict[c]["N Reads covX"] = coverage_dict[c]["N Reads covX"] / 2.0
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
63 else:
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
64 coverage_dict[c]["N Reads covX"] = nb_reads
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
65
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
66 genome_length = float(sum(contigs.values()))
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
67 coverage_dict["genome"] = {}
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
68 coverage_dict["genome"]["Length"] = int(genome_length)
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
69 coverage_dict["genome"]["Total Reads"] = int(nb_reads)
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
70 bases_mapped = sum([x["Bases Mapped"] for k, x in coverage_dict.items() if k != "genome"])
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
71 coverage_dict["genome"]["Bases Mapped"] = bases_mapped
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
72 sum_of_depth = sum([x["Sum of Depths"] for k, x in coverage_dict.items() if k != "genome"])
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
73 coverage_dict["genome"]["Sum of Depths"] = sum_of_depth
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
74 # brea = sum([x["Bases Mapped"] for k, x in coverage_dict.items() if k!="genome"])/ genome_length
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
75 # coverage_dict["genome"]["Breadth of Coverage"] = brea
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
76 if (genome_length == 0):
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
77 sys.exit("genome_length is 0 line 61")
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
78 coverage_dict["genome"]["Depth of Coverage"] = float(sum_of_depth)/float(genome_length)
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
79 cov_dict = coverage_dict["genome"]["Depth of Coverage"]
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
80 mapped_reads = sum([x["Mapped Reads"] for k, x in coverage_dict.items() if k != "genome"])
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
81 coverage_dict["genome"]["Mapped Reads"] = mapped_reads
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
82 if coverage_dict["genome"]["Depth of Coverage"] > 0:
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
83 coverage_dict["genome"]["N Reads for assembly"] = round(80 * nb_reads / cov_dict)
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
84 else:
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
85 coverage_dict["genome"]["N Reads for assembly"] = nb_reads
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
86 if paired:
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
87 n_reads = round(coverage_dict["genome"]["N Reads for assembly"] / 2.0)
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
88 coverage_dict["genome"]["N Reads for assembly"] = n_reads
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
89
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
90 if coverage_dict["genome"]["N Reads for assembly"] > nb_reads:
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
91 coverage_dict["genome"]["N Reads for assembly"] = nb_reads
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
92
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
93 coverage = []
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
94 for k, v in coverage_dict.items():
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
95 print(k)
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
96 for x in v.items():
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
97 print("%s\t%s" % (x[0], str(x[1])))
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
98 coverage += [(k, x[0], x[1])]
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
99
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
100 return coverage
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
101
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
102
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
103 def __main__():
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
104 parser = argparse.ArgumentParser(description="""Calculate coverage from bam""")
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
105 parser.add_argument("-b", "-bam", dest="bam", help="bam file")
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
106 parser.add_argument("-p", "-paired", dest="paired", help="if reads paired", action="store_true")
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
107 options = parser.parse_args()
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
108
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
109 # Check options:
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
110 if len(sys.argv) < 3 or len(sys.argv) > 4:
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
111 parser.print_help()
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
112 sys.exit(1)
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
113
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
114 paired = False
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
115 if options.paired:
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
116 paired = True
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
117 bam = options.bam
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
118 cov = coverage(bam, paired)
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
119
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
120
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
121 if __name__ == "__main__":
e5dc0426ca39 Uploaded
p.lucas
parents:
diff changeset
122 __main__()