Mercurial > repos > erasmus-medical-center > mycrobiota
diff mycrobiota.py @ 0:607c5e7e0a64 draft
planemo upload for repository https://github.com/ErasmusMC-Bioinformatics/galaxytools-emc/tree/master/tools/mycrobiota commit 1c4c58018b64ff3531a719e789ce71cb0a1244c5
author: erasmus-medical-center
date: Wed, 13 Dec 2017 10:09:50 -0500
parents: (none)
children: (none)
line wrap: on
line diff
"""MYcrobiota suite: helper commands for mothur-based microbiota QC.

Provides a command-line dispatcher plus sub-commands for QC report
tables, negative-control/replicate correction of shared files,
multi-sample OTU tables, BLAST link annotation and Krona plots.
"""
import argparse
import csv
import math
import os
from subprocess import call


def main():
    """Parse command-line arguments and dispatch the requested command."""
    print("Welcome to the MYcrobiota suite")
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--command', required=True,
                        help="What action to perform")
    parser.add_argument('-ct', '--count_table', action='append',
                        help="mothur count table")
    parser.add_argument('-cp', '--copies', help="copies of NC for samples")
    parser.add_argument('-nccp', '--nc_copies', help="copies of NC for itself")
    parser.add_argument('-t', '--taxonomy', action='append',
                        help="mothur taxonomy file")
    parser.add_argument('-s', '--shared_file', action='append',
                        help="mothur shared file")
    parser.add_argument('-otu', '--otutable', action='append',
                        help="mothur OTU table")
    parser.add_argument('-f', '--fasta', action='append', help="fasta")
    parser.add_argument('-sl', '--summary_log', action='append',
                        help="mothur summary log file")
    parser.add_argument('-o', '--outfile', help="output file")
    parser.add_argument('-od', '--outdir', help="output directory", default="")
    parser.add_argument('-lv', '--level', help="taxonomy level")
    parser.add_argument('-nc', '--negative_control',
                        help="sample name of the negative control")
    parser.add_argument('-ncs', '--negative_control_species',
                        help="species name of the negative control",
                        default="Oscillatoria")
    parser.add_argument('-r', '--replicate_suffix',
                        help="suffix to identify replicates")
    parser.add_argument('-l', '--label', action='append',
                        help="label for count table")
    parser.add_argument('--with-otu', dest='with_otu', action='store_true',
                        default=False)
    args = parser.parse_args()

    # Create the output directory; ignore the error when it already exists
    # (or when outdir is the default empty string).
    try:
        os.mkdir(args.outdir)
    except OSError:
        pass

    print("Running command: " + args.command)

    if args.command == 'counttable_totals':
        count_table_totals(args.count_table[0], args.outdir, args.outfile)

    elif args.command == 'qc_report':
        if args.count_table:
            qc_report(args.count_table, args.label, 'counttables', args.outdir)
        elif args.summary_log:
            qc_report(args.summary_log, args.label, 'summarylogs', args.outdir)

    elif args.command == 'create_krona_plot':
        create_krona_plot(args.taxonomy, args.outdir, args.with_otu)

    elif args.command == 'create_krona_plot_multisample':
        create_krona_plot_multisample(args.taxonomy, args.shared_file,
                                      args.level, args.outdir, args.with_otu)

    elif args.command == 'correct_replicates':
        correct_replicates(args.shared_file, args.taxonomy, args.outdir,
                           args.replicate_suffix, args.copies,
                           args.negative_control, args.nc_copies,
                           args.negative_control_species)

    elif args.command == 'make_multi_otutable':
        make_multi_otutable(args.taxonomy, args.shared_file, args.level,
                            args.outdir)

    elif args.command == 'otutable_add_blast_links':
        otutable_add_blast_links(args.otutable, args.fasta)

    elif args.command == 'split_multi_otutable':
        split_multi_otutable(args.otutable)

    else:
        print("unknown command. exiting")


def make_url(seq, baseurl):
    """Build an NCBI megablast submission URL for sequence *seq*."""
    return baseurl+"?DATABASE=nr&PERC_IDENT=97&EXCLUDE_SEQ_UNCULT=on&" \
                   "HITLIST_SIZE=10&FILTER=L&FILTER=m&FILTER=R&EXPECT=10&" \
                   "FORMAT_TYPE=HTML&PROGRAM=blastn&CLIENT=web&" \
                   "SERVICE=megablast&PAGE=Nucleotides&CMD=Put&QUERY=" \
                   + seq.lower()


def make_RIDlink(RID, baseurl):
    """Build an HTML link that retrieves the BLAST results for *RID*."""
    return "<a target=\"_blank\" href=\""+baseurl+"?CMD=Get&RID="\
           + RID + "\">view results</a>"


def make_rerun_link(seq, baseurl):
    """Build an HTML link that re-submits sequence *seq* to BLAST."""
    return "<a target=\"_blank\" href=\"" + baseurl +\
           "?DATABASE=nr&EXCLUDE_SEQ_UNCULT=yes&FILTER=L&FORMAT_TYPE=HTML" \
           "&PROGRAM=blastn&CLIENT=web&SERVICE=megablast&PAGE=Nucleotides&" \
           "CMD=Web&QUERY=" + seq.lower() + "\">send to BLAST</a>"


def otutable_add_blast_links(otutable, otureps):
    """Append a BLAST-link column to an OTU table.

    For every OTU row, find its representative sequence in *otureps*
    (fasta), add a "send to BLAST" link column and also write the matching
    representative sequences to filtered_otureps.fasta.

    :param otutable: list with the OTU table path at index 0
    :param otureps: list with the representative-sequence fasta at index 0
    """
    baseurl = "http://www.ncbi.nlm.nih.gov/blast/Blast.cgi"

    # For each otu create blast search of corresponding representative
    # sequence. Use with-blocks so the input handles are closed.
    with open(otureps[0], "r") as f:
        reps = f.readlines()

    # Sequence lines have alignment gaps ('-') stripped for the BLAST query.
    seqs = [r.rstrip('\n').replace('-', '') for r in reps if '>' not in r]
    seq_names = [r for r in reps if '>' in r]
    with open(otutable[0], "r") as f:
        otulines = f.readlines()

    # Add rerun link to table
    with open("otutable_with_blast.tsv", "w+") as outfile, \
            open("filtered_otureps.fasta", "w+") as repsout:
        outfile.write(otulines[0].rstrip() + "\tBLAST\n")

        for otuline in otulines[1:]:
            otu = otuline.split('\t')[0]
            for i, seq in enumerate(seq_names):
                if otu in seq:
                    outfile.write(otuline.rstrip() + "\t" +
                                  make_rerun_link(seqs[i], baseurl) + "\n")
            # output otureps for these otus (name line + following seq line)
            for i, seq in enumerate(reps):
                if otu in seq:
                    repsout.write(reps[i])
                    repsout.write(reps[i+1])


def summarylog_total(infile):
    """Return the total sequence count reported in a mothur summary log.

    :param infile: path to a mothur summary log
    :return: the integer total, or None when no total line is present
    """
    with open(infile) as f:
        summarylog = f.readlines()
    for line in summarylog:
        if line.startswith('# of Seqs:') \
                or line.startswith('total # of seqs:'):
            return int(line.split('\t')[-1])
    return None


def count_table_totals(infile, outdir='', outfile=''):
    """
    Given a Mothur counttable, calculate the total number of sequences for
    each sample. This can be appended as additional row in the count table
    by providing a file name.

    :param infile: Mothur count table.
    :param outfile: Optional. Write the count table with an additional row
                    with totals to this file
    :param outdir: Optional. Write output to this directory
    :return: A list with totals for all columns (samples) in the count table
    """
    out_rows = []
    # newline='' per the csv module contract on Python 3.
    with open(infile, newline='') as f:
        count_table = csv.reader(f, delimiter='\t')

        header = next(count_table)
        totals = [0] * (len(header)-1)

        out_rows.append(header)
        for row in count_table:
            # skip a pre-existing totals row so totals aren't doubled
            if row[0] != 'total':
                out_rows.append(row)
                for i in range(1, len(row)):
                    totals[i-1] += int(row[i])

    # list(...) needed: on Python 3 map() returns an iterator, and
    # list + map-object raises TypeError.
    out_rows.append(["total"] + list(map(str, totals)))

    # write count table with totals to file if requested
    if outfile:
        write_output(outdir, outfile, out_rows)

    return totals


def qc_report(infiles, label, inputtype, outdir=''):
    """
    Construct QC table from multiple count files
    - Report the number of sequences lost at each consecutive QC step
    - Create a multi-sample report and a separate report for each sample

    :param infiles: set of count tables (or summary logs)
    :param label: labels for each step
    :param inputtype: 'counttables' or 'summarylogs'
    :param outdir: directory to place output files. Default: cwd
    :return: None
    """
    assert len(infiles) == len(label), \
        "number of files and labels unequal, stopping"

    print("qcreport")
    previous_totals = []
    outlines = []
    lasttotal = None

    for (infile, lab) in zip(infiles, label):
        if inputtype == 'summarylogs':
            print("summarylogs")
            if not outlines:
                outlines = [['Step', 'Total', 'Removed', 'Percentage']]

            # get total count
            total = summarylog_total(infile)

            # calculate difference with last
            if not lasttotal:
                outlines.append([lab, total, None, None])
            else:
                diff = total - lasttotal
                perc = float(diff)/float(lasttotal)*100.0
                outlines.append([lab, total, diff, str("%.1f" % perc)+"%"])

            lasttotal = total

        else:
            # add header line to output (taken from the first count table);
            # text mode with newline='' is required by csv on Python 3
            if not outlines:
                with open(infile, newline='') as f:
                    count_file = csv.reader(f, delimiter='\t')
                    outlines = [['step'] + next(count_file)[1:]]

            # calculate totals of each column in the count file
            totals = count_table_totals(infile)

            # calculate difference with previous count file
            if not previous_totals:
                diffs = [""] * len(totals)
            else:
                diffs = [" (" + str(t1-t2)+"; " +
                         str("%.1f" % (float(t1-t2)/float(t2)*100.0)) +
                         "%)" for t1, t2 in zip(totals, previous_totals)]

            outlines.append([lab] +
                            [str(a)+b for a, b in zip(totals, diffs)])
            previous_totals = totals

    # write multi-sample output file
    write_output(outdir, 'all_qctable.tsv', outlines)

    # write per-sample output files
    for j in range(2, len(outlines[0])):
        sample = outlines[0][j]
        sample_outlines = [[outlines_line[0], outlines_line[j]]
                           for outlines_line in outlines]
        write_output(outdir, 'persample_qctable_'+sample+'.tsv',
                     sample_outlines)


def column(matrix, i):
    """Return column *i* of a row-major matrix as a list."""
    return [row[i] for row in matrix]


def mean(data):
    """Return the arithmetic mean of a non-empty sequence."""
    return sum(data) / float(len(data))


def stdev(data):
    """Return the sample standard deviation (n-1 denominator)."""
    c = mean(data)
    ss = sum((x-c)**2 for x in data)
    n = len(data)
    return math.sqrt(ss/(n-1))


def write_output(outdir, filename, outlines):
    """Write *outlines* (list of rows) as a TSV file under *outdir*.

    Opened in text mode with newline='' as required by csv.writer on
    Python 3 (the original 'wb' mode raises TypeError there).
    """
    with open(os.path.join(outdir, filename), 'w', newline='') as of:
        out_table = csv.writer(of, delimiter='\t', lineterminator='\n')
        for row in outlines:
            out_table.writerow(row)


def correct_replicates(shared, taxonomy, outdir, replicate_suffix,
                       sample_copies, negative_control='', nc_copies=-1,
                       negative_control_species='Oscillatoria'):
    """Correct a mothur shared file using a spiked negative-control OTU.

    Normalises sequence counts by the known copy number of the
    negative-control species, averages replicates, subtracts background
    (mean + 3*stdev of the negative control) and writes corrected
    taxonomy/shared/averaged TSV files to *outdir*.

    :param shared: list with the shared file path at index 0
    :param taxonomy: list with the taxonomy file path at index 0
    :param replicate_suffix: suffix separating sample name from replicate id
    :param sample_copies: known NC copies spiked into each sample
    :param negative_control: sample-name prefix of the negative control
    :param nc_copies: known NC copies in the negative-control sample itself
    :param negative_control_species: genus to locate the control OTU
    :return: 1 on failure to locate the control OTU, otherwise None
    """
    with open(shared[0], newline='') as f, \
            open(taxonomy[0], newline='') as f2:
        shared_file = csv.reader(f, delimiter='\t')
        taxonomy_file = csv.reader(f2, delimiter='\t')

        # determine which OTU number is the control, Oscillatoria by default
        # (Bacteria;Cyanobacteria;Cyanobacteria;..;Oscillatoria)
        try:
            line = next(taxonomy_file)
            while negative_control_species not in line[2]:
                line = next(taxonomy_file)
            otu = line[0]
        except StopIteration:
            print("negative control species not found in taxonomy, Exiting")
            return 1

        ''' Calculate Copies '''
        # per replicate of sample and NC, determine correction factor,
        # (number Oscillatoria seqs/known copies of it)
        # correct all sequence counts with that
        myshared = [row for row in shared_file if row]
        newshared = [myshared[0]]
        newshared2 = [myshared[0]]
        newshared3 = [myshared[0]]
        newshared4 = [myshared[0]]
        oscil_column = myshared[0].index(otu)

        for row in myshared[1:]:
            if row[1].startswith(negative_control):
                copies = nc_copies
            else:
                copies = sample_copies

            correction_factor = float(row[oscil_column]) / float(copies)

            new_row = row[0:3]
            for count in row[3:]:
                try:
                    newval = float(count) / correction_factor
                except ZeroDivisionError:
                    # no control sequences observed: keep the raw count
                    newval = float(count)
                new_row.append(newval)
            newshared.append(new_row)

        ''' Average copy counts across replicates '''
        levels = set([row[0] for row in newshared[1:]])
        samples = set([row[1].split(replicate_suffix)[0]
                       for row in newshared[1:]])

        for level in levels:
            for sample in samples:
                neg = sample.startswith(negative_control)
                replicates = [row for row in newshared if row[0] == level
                              and row[1].startswith(sample)]
                num_otus = int(replicates[0][2])+3
                total = replicates[0][2]
                avg = [level, sample, total]

                for i in range(3, num_otus):
                    counts = column(replicates, i)
                    # an OTU must be present in every replicate to count
                    # (the negative control is exempt from this rule)
                    if 0 not in counts or neg:
                        avg.append(mean(counts))
                    else:
                        avg.append(0)

                newshared2.append(avg)

        ''' Average *sequence* counts across replicates '''
        levels = set([row[0] for row in myshared[1:]])
        samples = set([row[1].split(replicate_suffix)[0]
                       for row in myshared[1:]])

        for level in levels:
            for sample in samples:
                replicates = [row for row in myshared if row[0] == level
                              and row[1].startswith(sample)]
                num_otus = int(replicates[0][2]) + 3
                total = replicates[0][2]
                avg = [level, sample, total]

                for i in range(3, num_otus):
                    # list(...) needed: mean() calls len(), which fails on
                    # a Python 3 map object
                    counts = list(map(int, column(replicates, i)))
                    avg.append(int(round(mean(counts))))

                newshared4.append(avg)

        ''' Correct for background '''
        # for each otu, subtract 3 times the standard deviation of
        # the negative control sample
        for level in levels:
            NC = [row for row in newshared if row[0] == level
                  and row[1].startswith(negative_control)]
            samples = [row for row in newshared2 if row[0] == level
                       and not row[1].startswith(negative_control)]
            num_otus = int(samples[0][2])+3

            for i in range(3, num_otus):
                m = mean(column(NC, i))
                sd = stdev(column(NC, i))
                corr = m + 3*sd

                for s in samples:
                    s[i] = max(0, int(round(s[i] - corr)))

            newshared3 += samples

        # remove Negative control species otu from table
        for i, row in enumerate(newshared3):
            del row[oscil_column]
            if i > 0:
                row[2] = int(row[2]) - 1

        # sort file or other mothur tools may segfault :/
        # 'unique' sorts before all numeric labels; the tuple key avoids the
        # Python 3 TypeError from comparing str with int.
        newshared3 = [newshared3[0]] + sorted(
            newshared3[1:], key=lambda a_entry: (a_entry[0] != 'unique',
                                                 a_entry[0]))

        f2.seek(0)
        taxonomy_out = [row for row in taxonomy_file
                        if row and row[0] != otu]
        write_output(outdir, 'taxonomy_corrected.tsv', taxonomy_out)
        write_output(outdir, 'shared_corrected.tsv', newshared3)
        write_output(outdir, 'shared_averaged.tsv', newshared4)


def make_multi_otutable(taxonomy_file, shared_file, level, outdir):
    """
    Create an otu table from shared file and taxonomy file

    example output:

    OTU sample1 sample2 .. sampleX Kingdom Phylum Class Order Family Genus
    Otu001 13 8 .. 91 Bacteria Bacteroidetes Bacteroidia ..
    ...

    :param taxonomy_file: list with the taxonomy file path at index 0
    :param shared_file: list with the shared file path at index 0
    :param level: which level to use, e.g. unique/0.03/..
    :param outdir: directory for the multi_otutable.tsv output
    :return: None
    """

    outlines = []
    samples = []
    # create multisample taxonomy from counts in shared file
    with open(taxonomy_file[0], newline='') as tax, \
            open(shared_file[0], newline='') as sh:
        taxonomy = csv.reader(tax, delimiter='\t')
        shared = csv.reader(sh, delimiter='\t')
        shared_header = next(shared)
        outlines.append(shared_header[3:])

        # get all taxonomies; strip the "(confidence)" suffix per level and
        # drop empty entries. list(filter(...)) because a filter object
        # cannot be concatenated to a list on Python 3.
        taxonomies = []
        for j, t in enumerate(taxonomy):
            if j > 0:
                taxonomies.append(list(filter(None,
                                              [x.split('(')[0]
                                               for x in t[2].split(';')])))

        for i, row in enumerate(shared):
            tax.seek(0)  # make sure to start at beginning of file each time
            if row[0] == level:
                samples.append(row[1])
                outlines.append(row[3:])

    # list(map(...)) so the transposed matrix is indexable on Python 3
    transposed = list(map(list, zip(*outlines)))
    header = ["OTU"] + samples + ["Kingdom", "Phylum", "Class", "Order",
                                  "Family", "Genus"]

    # drop OTUs whose counts are all zero
    writelines = [header] + [a + b for a, b in zip(transposed, taxonomies)
                             if a[1:] != ['0'] * len(a[1:])]

    # sort sample columns by name (transpose, sort, transpose back)
    lst = list(map(list, zip(*[w[1:-6] for w in writelines])))
    lst.sort(key=lambda x: x[0])
    lst = list(map(list, zip(*lst)))
    writelines2 = [[writelines[i][0]] + lst[i] + writelines[i][-6:]
                   for i in range(0, len(writelines))]

    # output corrected shared file
    write_output(outdir, "multi_otutable.tsv", writelines2)


def split_multi_otutable(otutable, with_avg=True):
    """Split a multi-sample OTU table into one table per sample.

    Sample columns are grouped by the name before the first '_'; a 'mean'
    column over the replicates is inserted before the six taxonomy columns.

    :param otutable: list with the multi-sample OTU table path at index 0
    :param with_avg: kept for interface compatibility (currently unused)
    :return: None
    """
    with open(otutable[0], 'r') as f:
        fulltable = [line.strip().split('\t') for line in f if line]
    samples = [s.split('_')[0] for s in fulltable[0][1:-6]]
    numcols = len(fulltable[0])
    # integer division: on Python 3 '/' would yield a float, which breaks
    # the ['0'] * numreplicates comparison below
    numreplicates = (numcols - 7) // len(set(samples))

    for sample in set(samples):
        outlines = []
        # column 0 (OTU id) + this sample's replicate columns + the
        # six trailing taxonomy columns
        cols = [0] + [i+1 for i, s in enumerate(samples) if sample in s] \
            + [i for i in range(numcols-6, numcols)]
        for i, line in enumerate(fulltable):
            out = [line[j] for j in cols]
            if out[1:-6] != ['0'] * numreplicates:
                # list(map(...)) so mean() can take len() on Python 3
                out.insert(-6, 'mean' if i == 0
                           else int(round(mean(list(map(int,
                                                        out[1:-6]))))))
                outlines.append(out)

        write_output('.', sample+'.otutable', outlines)


def create_krona_plot_multisample(taxonomy_file, shared_file, level, outdir,
                                  with_otu):
    """
    Create krona plots from a multisample taxonomy plot and a shared file.
    Create one multisample plot and a plot per individual sample

    :param taxonomy_file: list with the taxonomy file path at index 0
    :param shared_file: list with the shared file path at index 0
    :param level: which level to use, e.g. unique/0.03/..
    :param outdir: directory for the per-sample taxonomy files
    :param with_otu: add OTU number as a level in the Krona plot?
    :return: None
    """

    taxonomies = []

    # create taxonomy file per sample
    with open(taxonomy_file[0], newline='') as tax, \
            open(shared_file[0], newline='') as sh:
        taxonomy = csv.reader(tax, delimiter='\t')
        shared = csv.reader(sh, delimiter='\t')
        shared_header = next(shared)

        for i, row in enumerate(shared):
            tax.seek(0)  # make sure to start at beginning of file each time
            if row[0] == level:
                sample = row[1]

                outfile = os.path.join(outdir, sample+".tsv")
                taxonomies.append(outfile)
                with open(outfile, 'w+', newline='') as of:
                    out_table = csv.writer(of, delimiter='\t')
                    out_table.writerow(next(taxonomy))  # header line
                    for j, t in enumerate(taxonomy):
                        assert t[0] == shared_header[j+3], \
                            "OTU mismatch between taxonomy and shared file"
                        # replace the total count by this sample's count
                        t[1] = row[j+3]
                        out_table.writerow(t + [shared_header[j+3]])

    # make krona plot
    create_krona_plot(taxonomies, outdir, with_otu)


def create_krona_plot(taxonomy_files, outdir, with_otu):
    """
    Create a krona plot from one or more mothur taxonomy files

    :param taxonomy_files: mothur taxonomy file (output from classify.otu)
    :param outdir: directory to store krona-formatted outputs. Default=cwd
    :param with_otu: add OTU number as a level in the Krona plot?
                     Default=True
    :return: None
    """
    krona_input_files = []

    # convert taxonomy files to krona input.
    for tax in taxonomy_files:
        with open(tax, newline='') as f:
            taxonomy = csv.reader(f, delimiter='\t')
            out_rows = []

            next(taxonomy)  # skip header line
            for row in taxonomy:
                # count, taxonomy levels, optionally the OTU id; filter
                # removes the None placeholder and empty levels
                out_rows.append(
                    list(filter(None,
                                [row[1]] + row[2].rstrip(";\n").split(';')
                                + [row[0] if with_otu else None])))

        outfile = os.path.join(outdir, tax.split("/")[-1]+"krona")
        krona_input_files.append(outfile)

        with open(outfile, 'w+', newline='') as f2:
            out_table = csv.writer(f2, delimiter='\t')
            for row in out_rows:
                out_table.writerow(row)

    # execute krona command
    call(["ktImportText"] + krona_input_files)


if __name__ == "__main__":
    main()