Commit message:
Uploaded

added:
sam2counts_galaxy_edger.py
diff -r 000000000000 -r ce3a667012c2 sam2counts_galaxy_edger.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sam2counts_galaxy_edger.py	Thu Jan 22 03:54:22 2015 -0500
@@ -0,0 +1,213 @@
#!/usr/bin/python

"""
sam2count_galaxy_edger.py -- Take SAM files and output a table of counts with
column names that are the filenames, and rownames that are the reference
names.
"""

VERSION = 0.90

import sys
import csv
from os import path
try:
    import pysam
except ImportError:
    sys.exit("pysam not installed; please install it\n")

import argparse


def SAM_file_to_counts(filename, sname, ftype="sam", extra=False, use_all_references=True):
    """
    Take a SAM/BAM filename and create a hash of mapped and unmapped reads;
    keys are reference sequences, values are the counts of occurrences.

    Also, a hash of qualities (either 0 or >0) of mapped reads
    is output, which is handy for diagnostics.
    """
    counts = dict()
    unique = dict()
    nonunique = dict()
    mode = 'r'
    if ftype == "bam":
        mode = 'rb'
    sf = pysam.Samfile(filename, mode)

    if use_all_references:
        ## Make dictionary of all entries in the header
        try:
            for sn in sf.header['SQ']:
                if extra:
                    unique[sn['SN']] = 0
                    nonunique[sn['SN']] = 0
                counts[sn['SN']] = 0
        except KeyError:
            print("Sample file of sample " + sname + " does not have header.")

    for read in sf:
        if not read.is_unmapped:
            id_name = sf.getrname(read.rname) if read.rname != -1 else 0

            if not use_all_references and not counts.get(id_name, False):
                ## Only make keys based on aligning reads, make empty hash
                if extra:
                    unique[id_name] = 0
                    nonunique[id_name] = 0
                ## initiate entry; even if not mapped, record 0 count
                counts[id_name] = counts.get(id_name, 0)

            counts[id_name] = counts.get(id_name, 0) + 1

            if extra:
                if read.mapq == 0:
                    nonunique[id_name] = nonunique[id_name] + 1
                else:
                    unique[id_name] = unique[id_name] + 1

    if extra:
        return {'counts': counts, 'unique': unique, 'nonunique': nonunique}

    return {'counts': counts}


def collapsed_nested_count_dict(counts_dict, all_ids, order=None):
    """
    Take a nested dictionary `counts_dict` (sample -> reference -> count) and
    `all_ids`, the set of all reference ids. All samples (first-level keys) in
    `counts_dict` are made into columns, with order specified by `order`.

    Output is a dictionary with keys that are the ids (genes or
    transcripts), with values that are ordered counts. A header will
    be created on the first row from the ordered columns (extracted
    from filenames).
    """
    if order is None:
        col_order = counts_dict.keys()
    else:
        col_order = order

    collapsed_dict = dict()
    for i, filename in enumerate(col_order):
        for id_name in all_ids:
            if not collapsed_dict.get(id_name, False):
                collapsed_dict[id_name] = list()

            ## get counts and append
            c = counts_dict[filename].get(id_name, 0)
            collapsed_dict[id_name].append(c)
    return {'table': collapsed_dict, 'header': col_order}


def counts_to_file(table_dict, outfilename, delimiter=','):
    """
    A function for its side effect of writing `table_dict` (which
    contains a table and header) to `outfilename` with the specified
    `delimiter`.
    """
    ## the column header is written by the caller before rows are appended here
    writer = csv.writer(open(outfilename, 'a'), delimiter=delimiter, lineterminator='\n')
    table = table_dict['table']
    header = table_dict['header']

    for id_name, fields in table.items():
        ## skip the 0 sentinel used for reads without a reference name
        if id_name == 0:
            continue
        row = [id_name]
        row.extend(fields)
        writer.writerow(row)

[... diff truncated in the source display; the script's entry point and the first option definitions are not shown ...]

    parser.add_argument("-f", "--extra-out-files", dest="extra_out_files",
                        help="comma-delimited filenames of unique and non-unique output "
                             "(default: unique.txt,nonunique.txt)",
                        default='unique.txt,nonunique.txt')
    parser.add_argument("-v", "--verbose", dest="verbose",
                        help="enable verbose output")
    parser.add_argument("--bam-file", help="bam file", nargs="+", action="append", required=True)
    parser.add_argument("--group", help="group", nargs="+", action="append", required=True)
    parser.add_argument("--treatment", help="treatment", nargs="+", action="append", required=True)
    parser.add_argument("--sample-name", help="sample name", nargs="+", action="append", required=True)
    parser.add_argument("--file-type", help="file type", nargs="+", action="append", required=True, choices=['sam', 'bam'])
    args = parser.parse_args()

    ## flatten the nested lists produced by nargs="+" combined with action="append"
    args.bam_file = [item for sublist in args.bam_file for item in sublist]
    args.group = [item for sublist in args.group for item in sublist]
    args.treatment = [item for sublist in args.treatment for item in sublist]
    args.sample_name = [item for sublist in args.sample_name for item in sublist]
    args.file_type = [item for sublist in args.file_type for item in sublist]

    if len(args.sample_name) != len(set(args.sample_name)):
        parser.error("Sample names must be unique.")

    if not (len(args.bam_file) == len(args.group) == len(args.treatment)
            == len(args.sample_name) == len(args.file_type)):
        parser.error("Number of total BAM files, groups, treatments, sample names, and types must be the same.")

    file_counts = dict()
    file_unique_counts = dict()
    file_nonunique_counts = dict()
    all_ids = list()

    ## do a pre-run check that all files exist
    for full_filename in args.bam_file:
        if not path.exists(full_filename):
            parser.error("file '%s' does not exist" % full_filename)

    ## write the header: one group:treatment pair per sample, then the sample names
    outf = open(args.out_file, "w")
    outf.write("#")
    for (g, t) in zip(args.group, args.treatment):
        outf.write("\t" + g + ":" + t)
    outf.write("\n#Feature")
    for s in args.sample_name:
        outf.write("\t" + s)
    outf.write("\n")
    outf.close()

    for (full_filename, sn, ft) in zip(args.bam_file, args.sample_name, args.file_type):
        ## read in SAM/BAM file, extract counts, and unpack counts
        tmp = SAM_file_to_counts(full_filename, sn, ftype=ft, extra=args.extra_output,
                                 use_all_references=args.use_all_references)

        if args.extra_output:
            counts, unique, nonunique = tmp['counts'], tmp['unique'], tmp['nonunique']
        else:
            counts = tmp['counts']

        ## save counts, and unique/non-unique counts
        file_counts[sn] = counts

        if args.extra_output:
            file_unique_counts[sn] = unique
            file_nonunique_counts[sn] = nonunique

        ## add all ids encountered in this file
        all_ids.extend(file_counts[sn].keys())

    ## Uniquify all_ids, and then take the nested file_counts
    ## dictionary, collapse, and write to file.
    all_ids = set(all_ids)
    table_dict = collapsed_nested_count_dict(file_counts, all_ids, order=args.sample_name)
    counts_to_file(table_dict, args.out_file, delimiter=args.delimiter)

    if args.extra_output:
        unique_fn, nonunique_fn = args.extra_out_files.split(',')
        ## order columns by sample name, as for the main counts table
        unique_table_dict = collapsed_nested_count_dict(file_unique_counts, all_ids, order=args.sample_name)
        nonunique_table_dict = collapsed_nested_count_dict(file_nonunique_counts, all_ids, order=args.sample_name)

        counts_to_file(unique_table_dict, unique_fn, delimiter=args.delimiter)
        counts_to_file(nonunique_table_dict, nonunique_fn, delimiter=args.delimiter)
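For a sense of how the three helpers fit together, the following is a minimal sketch that mirrors what the (partially elided) entry point does: it assumes the script is importable as a module, that pysam is installed, and uses two hypothetical inputs (sample1.bam, sample2.bam) with made-up sample names. It is illustrative only and is not part of the uploaded tool.

    # Illustrative sketch only -- not part of the uploaded script.
    # Count reads per reference for two hypothetical BAM files, collapse the
    # per-sample dictionaries into one table, and append the rows to counts.txt
    # (the script itself writes its "#group:treatment" / "#Feature" header first).
    from sam2counts_galaxy_edger import (SAM_file_to_counts,
                                         collapsed_nested_count_dict,
                                         counts_to_file)

    samples = {"s1": "sample1.bam", "s2": "sample2.bam"}  # hypothetical inputs
    file_counts = {}
    all_ids = []
    for sname, bam in samples.items():
        file_counts[sname] = SAM_file_to_counts(bam, sname, ftype="bam")['counts']
        all_ids.extend(file_counts[sname].keys())

    table_dict = collapsed_nested_count_dict(file_counts, set(all_ids), order=["s1", "s2"])
    counts_to_file(table_dict, "counts.txt", delimiter="\t")  # appends one row per reference

The column order follows `order` (here the sample names), matching how the visible part of the entry point passes args.sample_name, so the count columns line up with the sample names written on the #Feature header line.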