Repository 'sam2counts_edger'
hg clone https://toolshed.g2.bx.psu.edu/repos/nikhil-joshi/sam2counts_edger

Changeset 0:ce3a667012c2 (2015-01-22)
Next changeset 1:a2ef5d59bd6e (2015-01-22)
Commit message:
Uploaded
added:
sam2counts_galaxy_edger.py
diff -r 000000000000 -r ce3a667012c2 sam2counts_galaxy_edger.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sam2counts_galaxy_edger.py Thu Jan 22 03:54:22 2015 -0500
@@ -0,0 +1,213 @@
+#!/usr/bin/python
+
+"""
+sam2count_galaxy_edger.py -- Take SAM files and output a table of counts with column
+names that are the filenames, and rowname that are the reference
+names.
+"""
+
+VERSION = 0.90
+
+import sys
+import csv
+from os import path
+try:
+    import pysam
+except ImportError:
+    sys.exit("pysam not installed; please install it\n")
+
+import argparse
+
+def SAM_file_to_counts(filename, sname, ftype="sam", extra=False, use_all_references=True):
+    """
+    Take SAM filename, and create a hash of mapped and unmapped reads;
+    keys are reference sequences, values are the counts of occurences.
+
+    Also, a hash of qualities (either 0 or >0) of mapped reads
+    is output, which is handy for diagnostics.
+    """
+    counts = dict()
+    unique = dict()
+    nonunique = dict()
+    mode = 'r'
+    if ftype == "bam":
+        mode = 'rb'
+    sf = pysam.Samfile(filename, mode)
+
+    if use_all_references:
+        # Make dictionary of all entries in header
+        try:
+            for sn in sf.header['SQ']:
+                if extra:
+                    unique[sn['SN']] = 0
+                    nonunique[sn['SN']] = 0
+                counts[sn['SN']] = 0
+        except KeyError:
+            print "Sample file of sample " + sname + " does not have header."
+
+    for read in sf:
+        if not read.is_unmapped:
+            id_name = sf.getrname(read.rname) if read.rname != -1 else 0
+
+            if not use_all_references and not counts.get(id_name, False):
+                ## Only make keys based on aligning reads, make empty hash
+                if extra:
+                    unique[id_name] = 0
+                    nonunique[id_name] = 0
+                ## initiate entry; even if not mapped, record 0 count
+                counts[id_name] = counts.get(id_name, 0)
+
+            counts[id_name] = counts.get(id_name, 0) + 1
+
+            if extra:
+                if read.mapq == 0:
+                    nonunique[id_name] = nonunique[id_name] + 1
+                else:
+                    unique[id_name] = unique[id_name] + 1
+
+    if extra:
+        return {'counts':counts, 'unique':unique, 'nonunique':nonunique}
+
+    return {'counts':counts}
+
+def collapsed_nested_count_dict(counts_dict, all_ids, order=None):
+    """
+    Takes a nested dictionary `counts_dict` and `all_ids`, which is
+    built with the `table_dict`. All files (first keys) in
+    `counts_dict` are made into columns with order specified by
+    `order`.
+
+    Output is a dictionary with keys that are the id's (genes or
+    transcripts), with values that are ordered counts. A header will
+    be created on the first row from the ordered columns (extracted
+    from filenames).
+    """
+    if order is None:
+        col_order = counts_dict.keys()
+    else:
+        col_order = order
+
+    collapsed_dict = dict()
+    for i, filename in enumerate(col_order):
+        for id_name in all_ids:
+            if not collapsed_dict.get(id_name, False):
+                collapsed_dict[id_name] = list()
+
+            # get counts and append
+            c = counts_dict[filename].get(id_name, 0)
+            collapsed_dict[id_name].append(c)
+    return {'table':collapsed_dict, 'header':col_order}
+
+
+def counts_to_file(table_dict, outfilename, delimiter=','):
+    """
+    A function for its side-effect of writing `table_dict` (which
+    contains a table and header), to `outfilename` with the specified
+    `delimiter`.
+    """
+    writer = csv.writer(open(outfilename, 'a'), delimiter=delimiter, lineterminator='\n')
+    table = table_dict['table']
+    header = table_dict['header']
+
+    #header_row = True
+    for id_name, fields in table.items():
+        #if header_row:
+            #row = ['id'] + header
+            #writer.writerow(row)
+            #header_row = False
+
+        if id_name == 0:
+            continue
+        row = [id_name]
+        row.extend
[...]
+    parser.add_argument("-f", "--extra-out-files", dest="extra_out_files",
+                      help="comma-delimited filenames of unique and non-unique output "
+                      "(default: unique.txt,nonunique.txt)",
+                      default='unique.txt,nonunique.txt')
+    parser.add_argument("-v", "--verbose", dest="verbose",
+                      help="enable verbose output")
+    parser.add_argument("--bam-file", help="bam file", nargs="+", action="append", required=True)
+    parser.add_argument("--group", help="group", nargs="+", action="append", required=True)
+    parser.add_argument("--treatment", help="treatment", nargs="+", action="append", required=True)
+    parser.add_argument("--sample-name", help="sample name", nargs="+", action="append", required=True)
+    parser.add_argument("--file-type", help="file type", nargs="+", action="append", required=True, choices=['sam','bam'])
+    args = parser.parse_args()
+
+    args.bam_file = [item for sublist in args.bam_file for item in sublist]
+    args.group = [item for sublist in args.group for item in sublist]
+    args.treatment = [item for sublist in args.treatment for item in sublist]
+    args.sample_name = [item for sublist in args.sample_name for item in sublist]
+    args.file_type = [item for sublist in args.file_type for item in sublist]
+    #print(args.sample_name)
+
+    if (len(args.sample_name) != len(set(args.sample_name))):
+        parser.error("Sample names must be unique.")
+
+    if not(len(args.bam_file) == len(args.group) and len(args.group) == len(args.treatment) and len(args.treatment) == len(args.sample_name) and len(args.sample_name) == len(args.file_type)):
+        parser.error("Number of total BAM files, groups, treatments, sample names, and types must be the same.")
+
+    file_counts = dict()
+    file_unique_counts = dict()
+    file_nonunique_counts = dict()
+    all_ids = list()
+
+    ## do a pre-run check that all files exist
+    for full_filename in args.bam_file:
+        if not path.exists(full_filename):
+            parser.error("file '%s' does not exist" % full_filename)
+
+    outf = open(args.out_file, "w")
+    outf.write("#")
+    for (g,t) in zip(args.group,args.treatment):
+        outf.write("\t" + g + ":" + t)
+    outf.write("\n#Feature")
+    for s in args.sample_name:
+        outf.write("\t" + s)
+    outf.write("\n")
+    outf.close()
+
+    for (full_filename, sn, ft) in zip(args.bam_file, args.sample_name, args.file_type):
+        ## read in SAM file, extract counts, and unpack counts
+        tmp = SAM_file_to_counts(full_filename, sn, ftype=ft, extra=args.extra_output,
+                                 use_all_references=args.use_all_references)
+
+        if args.extra_output:
+            counts, unique, nonunique = tmp['counts'], tmp['unique'], tmp['nonunique']
+        else:
+            counts = tmp['counts']
+
+        ## save counts, and unique/non-unique counts
+        file_counts[sn] = counts
+
+        if args.extra_output:
+            file_unique_counts[sn] = unique
+            file_nonunique_counts[sn] = nonunique
+
+        ## add all ids encountered in this in this file
+        all_ids.extend(file_counts[sn].keys())
+
+    ## Uniquify all_ids, and then take the nested file_counts
+    ## dictionary, collapse, and write to file.
+    all_ids = set(all_ids)
+    table_dict = collapsed_nested_count_dict(file_counts, all_ids, order=args.sample_name)
+    counts_to_file(table_dict, args.out_file, delimiter=args.delimiter)
+
+    if args.extra_output:
+        unique_fn, nonunique_fn = args.extra_out_files.split(',')
+        unique_table_dict = collapsed_nested_count_dict(file_unique_counts, all_ids, order=files)
+        nonunique_table_dict = collapsed_nested_count_dict(file_nonunique_counts, all_ids, order=files)
+
+        counts_to_file(unique_table_dict, unique_fn, delimiter=args.delimiter)
+        counts_to_file(nonunique_table_dict, nonunique_fn, delimiter=args.delimiter)
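
For reference, the three functions visible in this diff (SAM_file_to_counts, collapsed_nested_count_dict, counts_to_file) can be chained together outside of Galaxy. The sketch below is a minimal, hypothetical example and is not part of the repository: the BAM file names, sample names, and output file name are placeholders, it assumes a Python 2 environment with pysam installed (the script uses the legacy pysam.Samfile API and Python 2 print syntax), and it assumes the argparse block in the elided portion of the diff is guarded so that importing the module does not run it.

# Minimal sketch; placeholder inputs, assumes Python 2 + pysam and that the
# uploaded script is importable from the working directory.
from sam2counts_galaxy_edger import (SAM_file_to_counts,
                                     collapsed_nested_count_dict,
                                     counts_to_file)

bam_files = ["sample1.bam", "sample2.bam"]    # hypothetical input files
sample_names = ["ctrl_rep1", "treat_rep1"]    # one unique name per BAM file

file_counts = {}
all_ids = []
for bam, name in zip(bam_files, sample_names):
    # count mapped reads per reference sequence in one BAM file
    counts = SAM_file_to_counts(bam, name, ftype="bam")["counts"]
    file_counts[name] = counts
    all_ids.extend(counts.keys())

# collapse the per-sample dictionaries into one table, columns ordered by sample name
table_dict = collapsed_nested_count_dict(file_counts, set(all_ids), order=sample_names)

# append the count rows; the Galaxy wrapper writes the "#group:treatment" and
# "#Feature" header lines before this call, which is omitted in this sketch
counts_to_file(table_dict, "counts.txt", delimiter="\t")

The resulting table is what the repository name suggests is passed downstream to edgeR: one row per reference sequence, one count column per sample, ordered by the sample names given above.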