Mercurial > repos > artbio > mircounts
comparison mircounts.py @ 0:da29af78a960 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit d4d8106d66b65679a1a685ab94bfcf99cdb7b959
author | artbio |
---|---|
date | Mon, 24 Jul 2017 06:27:50 -0400 |
parents | |
children | da1aa7de2b19 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:da29af78a960 |
---|---|
1 #!/usr/bin/env python | |
2 import argparse | |
3 | |
4 import pysam | |
5 | |
6 | |
7 def Parser(): | |
8 parser = argparse.ArgumentParser(description='miRNAs counts and coverages') | |
9 parser.add_argument('-a', '--alignment', metavar='FILE', type=str, | |
10 dest='alignment_file', help='Alignment bam file') | |
11 parser.add_argument('--gff', metavar='FILE', type=str, dest='gff_file', | |
12 help='GFF3 describing both pre-miRNAs\ | |
13 and mature miRNAs') | |
14 parser.add_argument('-q', '--quality_threshold', type=int, | |
15 dest='quality_threshold', | |
16 help='Quality threshold for coverage (default=10)', | |
17 default=10) | |
18 parser.add_argument('-p', '--pre_mirs', type=str, dest='pre_mirs', | |
19 help='pre-miRNAs count file path', metavar='FILE') | |
20 parser.add_argument('-m', '--mirs', type=str, dest='mirs', | |
21 help='mature miRNA count file path', metavar='FILE') | |
22 parser.add_argument('--lattice', metavar='FILE', type=str, dest='lattice', | |
23 help='Output file for the lattice dataframe.') | |
24 args = parser.parse_args() | |
25 return args | |
26 | |
27 | |
28 def get_pre_mir_counts(bamfile): | |
29 """ | |
30 Takes a AlignmentFile object and returns a dictionary of counts for reads | |
31 aligning with pre_mirs (as keys) | |
32 """ | |
33 count = dict() | |
34 for ref_name in bamfile.references: | |
35 count[ref_name] = bamfile.count(reference=ref_name) | |
36 return count | |
37 | |
38 | |
39 def get_pre_mir_coverage(bamfile, quality=10): | |
40 """ | |
41 Takes a AlignmentFile object and returns a dictionary of lists | |
42 of coverage along the coordinates of pre_mirs (as keys) | |
43 """ | |
44 coverage = dict() | |
45 for ref_name, ref_len in zip(bamfile.references, bamfile.lengths): | |
46 coverage[ref_name] = bamfile.count_coverage(reference=ref_name, | |
47 start=0, end=ref_len, | |
48 quality_threshold=quality) | |
49 """ Add the 4 coverage values """ | |
50 coverage[ref_name] = [sum(x) for x in | |
51 zip(*coverage[ref_name])] | |
52 return coverage | |
53 | |
54 | |
55 def get_mir_counts(bamfile, gff_file): | |
56 """ | |
57 Takes a AlignmentFile and a gff file and computes for | |
58 each 'miRNA' region of the gff the number of reads that hit it | |
59 returns a dict[mir_name] = count | |
60 """ | |
61 counts = dict() | |
62 for line in open(gff_file, 'r'): | |
63 if line[0] != '#': | |
64 gff_fields = line[:-1].split("\t") | |
65 if gff_fields[2] == 'miRNA': | |
66 mir_name = gff_fields[0] | |
67 premir_name = gff_fields[8].split('=')[-1] | |
68 mir_start = int(gff_fields[3]) | |
69 mir_end = int(gff_fields[4]) | |
70 # GFF is 1-based, pysam is 0-based. | |
71 counts[mir_name] = bamfile.count(reference=premir_name, | |
72 start=mir_start-1, | |
73 end=mir_end-1) | |
74 return counts | |
75 | |
76 | |
77 def write_dataframe_coverage(countdict, outfile): | |
78 """ | |
79 Takes a dict[pre_mir reference name] = [coverage list] | |
80 and writes a dataframe with columns: | |
81 <gene_type name>, offset, normoffset, counts and normcounts | |
82 in the outfile | |
83 """ | |
84 F = open(outfile, 'w') | |
85 F.write('Mir_hairpin\tOffset\tNorm_offset\tCount\tNorm_count\n') | |
86 for ref in sorted(countdict): | |
87 """ | |
88 For each reference name in mirs, | |
89 write the coverage of each of its positions | |
90 """ | |
91 maximum = max(countdict[ref]) | |
92 reference_length = len(countdict[ref]) | |
93 for pos, c in enumerate(countdict[ref]): | |
94 """ Compute and write value for each reference position""" | |
95 F.write('%s\t%s\t%s\t%s\t%s\n' % (ref, str(pos + 1), | |
96 str(float(pos+1)/reference_length), str(float(c)), | |
97 str(float(c)/maximum) if maximum != 0 else '0')) | |
98 F.close() | |
99 | |
100 | |
101 def write_counts(countdict, outfile): | |
102 """ | |
103 Takes a dict[<gene_type name>]=count and | |
104 writes a count table | |
105 """ | |
106 F = open(outfile, 'w') | |
107 F.write('Gene\tCounts\n') | |
108 for gene in sorted(countdict): | |
109 F.write('%s\t%s\n' % (gene, str(countdict[gene]))) | |
110 F.close() | |
111 | |
112 | |
113 def main(): | |
114 args = Parser() | |
115 bamfile = pysam.AlignmentFile(args.alignment_file, 'rb', check_sq=False) | |
116 if args.pre_mirs: | |
117 pre_mirs = get_pre_mir_counts(bamfile) | |
118 write_counts(pre_mirs, args.pre_mirs) | |
119 if args.lattice: | |
120 pre_mirs_coverage = get_pre_mir_coverage(bamfile, | |
121 args.quality_threshold) | |
122 write_dataframe_coverage(pre_mirs_coverage, args.lattice) | |
123 if args.mirs: | |
124 mirs = get_mir_counts(bamfile, args.gff_file) | |
125 write_counts(mirs, args.mirs) | |
126 | |
127 | |
128 if __name__ == '__main__': | |
129 main() |