comparison sam2counts_galaxy_edger.py @ 0:ce3a667012c2 draft

Uploaded
author nikhil-joshi
date Thu, 22 Jan 2015 03:54:22 -0500
#!/usr/bin/python

"""
sam2counts_galaxy_edger.py -- Take SAM/BAM files and output a table of
counts whose column names are the sample names and whose row names are
the reference names.
"""

VERSION = 0.90

import sys
import csv
from os import path

try:
    import pysam
except ImportError:
    sys.exit("pysam not installed; please install it\n")

import argparse

def SAM_file_to_counts(filename, sname, ftype="sam", extra=False, use_all_references=True):
    """
    Take a SAM/BAM filename and create a dictionary of mapped-read counts;
    keys are reference sequences, values are the counts of occurrences.

    If `extra` is true, dictionaries splitting the mapped reads by mapping
    quality (either 0 or >0) are also returned, which is handy for
    diagnostics.
    """
    counts = dict()
    unique = dict()
    nonunique = dict()
    mode = 'r'
    if ftype == "bam":
        mode = 'rb'
    sf = pysam.Samfile(filename, mode)

    if use_all_references:
        # Make a dictionary entry for every reference in the header.
        try:
            for sn in sf.header['SQ']:
                if extra:
                    unique[sn['SN']] = 0
                    nonunique[sn['SN']] = 0
                counts[sn['SN']] = 0
        except KeyError:
            print "Sample file of sample " + sname + " does not have a header."

    for read in sf:
        if not read.is_unmapped:
            id_name = sf.getrname(read.rname) if read.rname != -1 else 0

            if not use_all_references and not counts.get(id_name, False):
                ## Only make keys based on aligning reads; create empty entries.
                if extra:
                    unique[id_name] = 0
                    nonunique[id_name] = 0
                ## Initiate entry; even if not yet counted, record a 0 count.
                counts[id_name] = counts.get(id_name, 0)

            counts[id_name] = counts.get(id_name, 0) + 1

            if extra:
                if read.mapq == 0:
                    nonunique[id_name] = nonunique[id_name] + 1
                else:
                    unique[id_name] = unique[id_name] + 1

    if extra:
        return {'counts': counts, 'unique': unique, 'nonunique': nonunique}

    return {'counts': counts}
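
# Sketch of the return value (hypothetical reference names and counts):
#   SAM_file_to_counts("a.bam", "s1", ftype="bam", extra=True)
# returns something shaped like
#   {'counts':    {'chr1': 120, 'chr2': 30},
#    'unique':    {'chr1': 100, 'chr2': 28},
#    'nonunique': {'chr1': 20,  'chr2': 2}}
# where counts = unique + nonunique for each reference.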

def collapsed_nested_count_dict(counts_dict, all_ids, order=None):
    """
    Take a nested dictionary `counts_dict` (sample name -> counts per id)
    and `all_ids`, the set of all ids seen across samples. All samples
    (first-level keys) in `counts_dict` are made into columns, with order
    specified by `order`.

    Output is a dictionary with keys that are the ids (genes or
    transcripts) and values that are the ordered per-sample counts,
    together with the ordered column names as a header.
    """
    if order is None:
        col_order = counts_dict.keys()
    else:
        col_order = order

    collapsed_dict = dict()
    for filename in col_order:
        for id_name in all_ids:
            if not collapsed_dict.get(id_name, False):
                collapsed_dict[id_name] = list()

            # Get this sample's count for this id (0 if absent) and append.
            c = counts_dict[filename].get(id_name, 0)
            collapsed_dict[id_name].append(c)
    return {'table': collapsed_dict, 'header': col_order}
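
# Sketch of the collapse (hypothetical sample and gene names): given
#   counts_dict = {'s1': {'geneA': 5}, 's2': {'geneA': 3, 'geneB': 1}}
#   all_ids     = set(['geneA', 'geneB'])
#   order       = ['s1', 's2']
# the result is
#   {'table': {'geneA': [5, 3], 'geneB': [0, 1]}, 'header': ['s1', 's2']}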


def counts_to_file(table_dict, outfilename, delimiter=','):
    """
    A function for its side effect of writing `table_dict` (which
    contains a table and header) to `outfilename` with the specified
    `delimiter`. The file is opened in append mode so that header lines
    written beforehand are preserved.
    """
    writer = csv.writer(open(outfilename, 'a'), delimiter=delimiter, lineterminator='\n')
    table = table_dict['table']

    for id_name, fields in table.items():
        # id_name 0 is the placeholder for mapped reads with no reference
        # name; skip it.
        if id_name == 0:
            continue
        row = [id_name]
        row.extend(fields)
        writer.writerow(row)
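
# With the default tab delimiter, each appended row is
# "<id><TAB><count per sample>...", e.g. (hypothetical values):
#   geneA    5    3
#   geneB    0    1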

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='parser for sam2counts')
    parser.add_argument("-d", "--delimiter", help="the delimiter (default: tab)", default='\t')
    parser.add_argument("-o", "--out-file", help="output filename (default: counts.txt)", default='counts.txt')
    parser.add_argument("-u", "--extra-output", dest="extra_output",
                        action="store_true", default=False,
                        help="output extra information on non-unique and unique mappers (default: False)")
    parser.add_argument("-r", "--use-all-references", dest="use_all_references",
                        help="count only references with aligned reads rather than "
                        "all references from the SAM header (default: use all)",
                        default=True, action="store_false")
    parser.add_argument("-f", "--extra-out-files", dest="extra_out_files",
                        help="comma-delimited filenames of unique and non-unique output "
                        "(default: unique.txt,nonunique.txt)",
                        default='unique.txt,nonunique.txt')
    parser.add_argument("-v", "--verbose", dest="verbose",
                        action="store_true", default=False,
                        help="enable verbose output")
    parser.add_argument("--bam-file", help="SAM/BAM file", nargs="+", action="append", required=True)
    parser.add_argument("--group", help="group", nargs="+", action="append", required=True)
    parser.add_argument("--treatment", help="treatment", nargs="+", action="append", required=True)
    parser.add_argument("--sample-name", help="sample name", nargs="+", action="append", required=True)
    parser.add_argument("--file-type", help="file type", nargs="+", action="append", required=True, choices=['sam', 'bam'])
    args = parser.parse_args()

    # With action="append" plus nargs="+", each repeated option contributes
    # a list, so the parsed values are lists of lists; flatten them, e.g.
    # [['a.bam'], ['b.bam']] -> ['a.bam', 'b.bam'].
    args.bam_file = [item for sublist in args.bam_file for item in sublist]
    args.group = [item for sublist in args.group for item in sublist]
    args.treatment = [item for sublist in args.treatment for item in sublist]
    args.sample_name = [item for sublist in args.sample_name for item in sublist]
    args.file_type = [item for sublist in args.file_type for item in sublist]

    if len(args.sample_name) != len(set(args.sample_name)):
        parser.error("Sample names must be unique.")

    if not (len(args.bam_file) == len(args.group) == len(args.treatment)
            == len(args.sample_name) == len(args.file_type)):
        parser.error("Numbers of BAM files, groups, treatments, sample names, and file types must all be the same.")

    file_counts = dict()
    file_unique_counts = dict()
    file_nonunique_counts = dict()
    all_ids = list()

    ## Do a pre-run check that all files exist.
    for full_filename in args.bam_file:
        if not path.exists(full_filename):
            parser.error("file '%s' does not exist" % full_filename)

    ## Write the two header lines: group:treatment pairs, then sample names.
    outf = open(args.out_file, "w")
    outf.write("#")
    for (g, t) in zip(args.group, args.treatment):
        outf.write("\t" + g + ":" + t)
    outf.write("\n#Feature")
    for s in args.sample_name:
        outf.write("\t" + s)
    outf.write("\n")
    outf.close()
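
    # For example (hypothetical metadata: --group WT KO, --treatment c t,
    # --sample-name s1 s2), the two header lines written above are:
    #   #<TAB>WT:c<TAB>KO:t
    #   #Feature<TAB>s1<TAB>s2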

    for (full_filename, sn, ft) in zip(args.bam_file, args.sample_name, args.file_type):
        ## Read in the SAM/BAM file, extract counts, and unpack counts.
        tmp = SAM_file_to_counts(full_filename, sn, ftype=ft, extra=args.extra_output,
                                 use_all_references=args.use_all_references)

        if args.extra_output:
            counts, unique, nonunique = tmp['counts'], tmp['unique'], tmp['nonunique']
        else:
            counts = tmp['counts']

        ## Save counts, and unique/non-unique counts.
        file_counts[sn] = counts

        if args.extra_output:
            file_unique_counts[sn] = unique
            file_nonunique_counts[sn] = nonunique

        ## Add all ids encountered in this file.
        all_ids.extend(file_counts[sn].keys())

    ## Uniquify all_ids, then collapse the nested file_counts
    ## dictionary and write it to file.
    all_ids = set(all_ids)
    table_dict = collapsed_nested_count_dict(file_counts, all_ids, order=args.sample_name)
    counts_to_file(table_dict, args.out_file, delimiter=args.delimiter)

    if args.extra_output:
        unique_fn, nonunique_fn = args.extra_out_files.split(',')
        unique_table_dict = collapsed_nested_count_dict(file_unique_counts, all_ids, order=args.sample_name)
        nonunique_table_dict = collapsed_nested_count_dict(file_nonunique_counts, all_ids, order=args.sample_name)

        counts_to_file(unique_table_dict, unique_fn, delimiter=args.delimiter)
        counts_to_file(nonunique_table_dict, nonunique_fn, delimiter=args.delimiter)