diff RepEnrich.py @ 0:f6f0f1e5e940 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
author artbio
date Wed, 02 Aug 2017 05:17:29 -0400
parents
children 15e3e29f310e
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RepEnrich.py	Wed Aug 02 05:17:29 2017 -0400
@@ -0,0 +1,502 @@
+#!/usr/bin/env python
+
+import argparse
+import csv
+import os
+import shlex
+
+import shutil
+import subprocess
+import sys
+
+import numpy
+
+
+parser = argparse.ArgumentParser(description='''
+             Part II: Conducting the alignments to the psuedogenomes. Before\
+             doing this step you will require 1) a bamfile of the unique\
+             alignments with index 2) a fastq file of the reads mapping to\
+             more than one location. These files can be obtained using the\
+             following bowtie options [EXAMPLE: bowtie -S -m 1\
+             --max multimap.fastq mm9 mate1_reads.fastq]  Once you have the\
+             unique alignment bamfile and the reads mapping to more than one\
+             location in a fastq file you can run this step.  EXAMPLE: python\
+             master_output.py\
+             /users/nneretti/data/annotation/hg19/hg19_repeatmasker.txt\
+             /users/nneretti/datasets/repeatmapping/POL3/Pol3_human/
+             HeLa_InputChIPseq_Rep1 HeLa_InputChIPseq_Rep1\
+             /users/nneretti/data/annotation/hg19/setup_folder\
+             HeLa_InputChIPseq_Rep1_multimap.fastq\
+             HeLa_InputChIPseq_Rep1.bam''')
+parser.add_argument('--version', action='version', version='%(prog)s 0.1')
+parser.add_argument('annotation_file', action='store',
+                    metavar='annotation_file',
+                    help='List RepeatMasker.org annotation file for your\
+                          organism.  The file may be downloaded from the\
+                          RepeatMasker.org website. Example:\
+                          /data/annotation/hg19/hg19_repeatmasker.txt')
+parser.add_argument('outputfolder', action='store', metavar='outputfolder',
+                    help='List folder to contain results.\
+                          Example: /outputfolder')
+parser.add_argument('outputprefix', action='store', metavar='outputprefix',
+                    help='Enter prefix name for data.\
+                           Example: HeLa_InputChIPseq_Rep1')
+parser.add_argument('setup_folder', action='store', metavar='setup_folder',
+                    help='List folder that contains the repeat element\
+                          pseudogenomes.\
+                          Example: /data/annotation/hg19/setup_folder')
+parser.add_argument('fastqfile', action='store', metavar='fastqfile',
+                    help='Enter file for the fastq reads that map to multiple\
+                          locations. Example: /data/multimap.fastq')
+parser.add_argument('alignment_bam', action='store', metavar='alignment_bam',
+                    help='Enter bamfile output for reads that map uniquely.\
+                    Example /bamfiles/old.bam')
+parser.add_argument('--pairedend', action='store', dest='pairedend',
+                    default='FALSE',
+                    help='Designate this option for paired-end sequencing.\
+                          Default FALSE change to TRUE')
+parser.add_argument('--collapserepeat', action='store', dest='collapserepeat',
+                    metavar='collapserepeat', default='Simple_repeat',
+                    help='Designate this option to generate a collapsed repeat\
+                          type. Uncollapsed output is generated in addition to\
+                          collapsed repeat type. Simple_repeat is default to\
+                          simplify downstream analysis. You can change the\
+                          default to another repeat name to collapse a\
+                          seperate specific repeat instead or if the name of\
+                          Simple_repeat is different for your organism.\
+                          Default Simple_repeat')
+parser.add_argument('--fastqfile2', action='store', dest='fastqfile2',
+                    metavar='fastqfile2', default='none',
+                    help='Enter fastqfile2 when using paired-end option.\
+                          Default none')
+parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
+                    default="1", type=int,
+                    help='Enter available cpus per node.  The more cpus the\
+                          faster RepEnrich performs. RepEnrich is designed to\
+                          only work on one node. Default: "1"')
+parser.add_argument('--allcountmethod', action='store', dest='allcountmethod',
+                    metavar='allcountmethod', default="FALSE",
+                    help='By default the pipeline only outputs the fraction\
+                          count method.  Consdidered to be the best way to\
+                          count multimapped reads. Changing this option will\
+                          include the unique count method, a conservative\
+                          count, and the total count method, a liberal\
+                          counting strategy. Our evaluation of simulated data\
+                          indicated fraction counting is best.\
+                          Default = FALSE, change to TRUE')
+parser.add_argument('--is_bed', action='store', dest='is_bed',
+                    metavar='is_bed', default='FALSE',
+                    help='Is the annotation file a bed file.\
+                         This is also a compatible format. The file needs to\
+                         be a tab seperated bed with optional fields.\
+                         Ex. format: chr\tstart\tend\tName_element\tclass\
+                         \tfamily. The class and family should identical to\
+                         name_element if not applicable.\
+                         Default FALSE change to TRUE')
+args = parser.parse_args()
+
+# parameters
+annotation_file = args.annotation_file
+outputfolder = args.outputfolder
+outputfile_prefix = args.outputprefix
+setup_folder = args.setup_folder
+repeat_bed = setup_folder + os.path.sep + 'repnames.bed'
+unique_mapper_bam = args.alignment_bam
+fastqfile_1 = args.fastqfile
+fastqfile_2 = args.fastqfile2
+cpus = args.cpus
+b_opt = "-k1 -p " + str(1) + " --quiet"
+simple_repeat = args.collapserepeat
+paired_end = args.pairedend
+allcountmethod = args.allcountmethod
+is_bed = args.is_bed
+
+##############################################################################
+# check that the programs we need are available
+try:
+    subprocess.call(shlex.split("coverageBed -h"),
+                    stdout=open(os.devnull, 'wb'),
+                    stderr=open(os.devnull, 'wb'))
+    subprocess.call(shlex.split("bowtie --version"),
+                    stdout=open(os.devnull, 'wb'),
+                    stderr=open(os.devnull, 'wb'))
+except OSError:
+    print("Error: Bowtie or BEDTools not loaded")
+    raise
+
+##############################################################################
+# define a csv reader that reads space deliminated files
+print('Preparing for analysis using RepEnrich...')
+csv.field_size_limit(sys.maxsize)
+
+
+def import_text(filename, separator):
+    for line in csv.reader(open(filename), delimiter=separator,
+                           skipinitialspace=True):
+        if line:
+            yield line
+
+
+##############################################################################
+# build dictionaries to convert repclass and rep families'
+if is_bed == "FALSE":
+    repeatclass = {}
+    repeatfamily = {}
+    fin = import_text(annotation_file, ' ')
+    x = 0
+    for line in fin:
+        if x > 2:
+            classfamily = []
+            classfamily = line[10].split(os.path.sep)
+            line9 = line[9].replace("(", "_").replace(
+                                    ")", "_").replace("/", "_")
+            repeatclass[line9] = classfamily[0]
+            if len(classfamily) == 2:
+                repeatfamily[line9] = classfamily[1]
+            else:
+                repeatfamily[line9] = classfamily[0]
+        x += 1
+if is_bed == "TRUE":
+    repeatclass = {}
+    repeatfamily = {}
+    fin = open(annotation_file, 'r')
+    for line in fin:
+        line = line.strip('\n')
+        line = line.split('\t')
+        theclass = line[4]
+        thefamily = line[5]
+        line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_")
+        repeatclass[line3] = theclass
+        repeatfamily[line3] = thefamily
+fin.close()
+
+##############################################################################
+# build list of repeats initializing dictionaries for downstream analysis'
+fin = import_text(setup_folder + os.path.sep + 'repgenomes_key.txt', '\t')
+repeat_key = {}
+rev_repeat_key = {}
+repeat_list = []
+reptotalcounts = {}
+classfractionalcounts = {}
+familyfractionalcounts = {}
+classtotalcounts = {}
+familytotalcounts = {}
+reptotalcounts_simple = {}
+fractionalcounts = {}
+i = 0
+for line in fin:
+    reptotalcounts[line[0]] = 0
+    fractionalcounts[line[0]] = 0
+    if line[0] in repeatclass:
+        classtotalcounts[repeatclass[line[0]]] = 0
+        classfractionalcounts[repeatclass[line[0]]] = 0
+    if line[0] in repeatfamily:
+        familytotalcounts[repeatfamily[line[0]]] = 0
+        familyfractionalcounts[repeatfamily[line[0]]] = 0
+    if line[0] in repeatfamily:
+        if repeatfamily[line[0]] == simple_repeat:
+            reptotalcounts_simple[simple_repeat] = 0
+    else:
+        reptotalcounts_simple[line[0]] = 0
+    repeat_list.append(line[0])
+    repeat_key[line[0]] = int(line[1])
+    rev_repeat_key[int(line[1])] = line[0]
+fin.close()
+##############################################################################
+# map the repeats to the psuedogenomes:
+if not os.path.exists(outputfolder):
+    os.mkdir(outputfolder)
+##############################################################################
+# Conduct the regions sorting
+print('Conducting region sorting on unique mapping reads....')
+fileout = outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt'
+with open(fileout, 'w') as stdout:
+    command = shlex.split("coverageBed -abam " + unique_mapper_bam + " -b " +
+                          setup_folder + os.path.sep + 'repnames.bed')
+    p = subprocess.Popen(command, stdout=stdout)
+p.communicate()
+stdout.close()
+filein = open(outputfolder + os.path.sep + outputfile_prefix
+              + '_regionsorter.txt', 'r')
+counts = {}
+sumofrepeatreads = 0
+for line in filein:
+    line = line.split('\t')
+    if not str(repeat_key[line[3]]) in counts:
+        counts[str(repeat_key[line[3]])] = 0
+    counts[str(repeat_key[line[3]])] += int(line[4])
+    sumofrepeatreads += int(line[4])
+print('Identified ' + str(sumofrepeatreads) +
+      'unique reads that mapped to repeats.')
+##############################################################################
+if paired_end == 'TRUE':
+    if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
+        os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie')
+    if not os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'):
+        os.mkdir(outputfolder + os.path.sep + 'pair2_bowtie')
+    folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie'
+    folder_pair2 = outputfolder + os.path.sep + 'pair2_bowtie'
+##############################################################################
+    print("Processing repeat psuedogenomes...")
+    ps = []
+    psb = []
+    ticker = 0
+    for metagenome in repeat_list:
+        metagenomepath = setup_folder + os.path.sep + metagenome
+        file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
+        file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie'
+        with open(file1, 'w') as stdout:
+            command = shlex.split("bowtie " + b_opt + " " +
+                                  metagenomepath + " " + fastqfile_1)
+            p = subprocess.Popen(command, stdout=stdout)
+        with open(file2, 'w') as stdout:
+            command = shlex.split("bowtie " + b_opt + " " +
+                                  metagenomepath + " " + fastqfile_2)
+            pp = subprocess.Popen(command, stdout=stdout)
+        ps.append(p)
+        ticker += 1
+        psb.append(pp)
+        ticker += 1
+        if ticker == cpus:
+            for p in ps:
+                p.communicate()
+            for p in psb:
+                p.communicate()
+            ticker = 0
+            psb = []
+            ps = []
+    if len(ps) > 0:
+        for p in ps:
+            p.communicate()
+    stdout.close()
+
+##############################################################################
+# combine the output from both read pairs:
+    print('sorting and combining the output for both read pairs...')
+    if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
+        os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie')
+    sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
+    for metagenome in repeat_list:
+        file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
+        file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie'
+        fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie'
+        with open(fileout, 'w') as stdout:
+            p1 = subprocess.Popen(['cat', file1, file2],
+                                  stdout=subprocess.PIPE)
+            p2 = subprocess.Popen(['cut', '-f1', "-d "], stdin=p1.stdout,
+                                  stdout=subprocess.PIPE)
+            p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
+                                  stdout=subprocess.PIPE)
+            p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
+                                  stdout=subprocess.PIPE)
+            p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
+            p5.communicate()
+        stdout.close()
+    print('completed ...')
+##############################################################################
+if paired_end == 'FALSE':
+    if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
+        os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie')
+    folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie'
+##############################################################################
+    ps = []
+    ticker = 0
+    print("Processing repeat psuedogenomes...")
+    for metagenome in repeat_list:
+        metagenomepath = setup_folder + os.path.sep + metagenome
+        file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
+        with open(file1, 'w') as stdout:
+            command = shlex.split("bowtie " + b_opt + " " +
+                                  metagenomepath + " " + fastqfile_1)
+            p = subprocess.Popen(command, stdout=stdout)
+        ps.append(p)
+        ticker += 1
+        if ticker == cpus:
+            for p in ps:
+                p.communicate()
+            ticker = 0
+            ps = []
+    if len(ps) > 0:
+        for p in ps:
+            p.communicate()
+    stdout.close()
+
+##############################################################################
+# combine the output from both read pairs:
+    print('Sorting and combining the output for both read pairs....')
+    if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
+        os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie')
+    sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
+    for metagenome in repeat_list:
+        file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
+        fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie'
+        with open(fileout, 'w') as stdout:
+            p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE)
+            p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout,
+                                  stdout=subprocess.PIPE)
+            p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
+                                  stdout=subprocess.PIPE)
+            p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
+                                  stdout=subprocess.PIPE)
+            p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
+            p5.communicate()
+        stdout.close()
+    print('completed ...')
+
+##############################################################################
+# build a file of repeat keys for all reads
+print('Writing and processing intermediate files...')
+sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
+readid = {}
+sumofrepeatreads = 0
+for rep in repeat_list:
+    for data in import_text(sorted_bowtie + os.path.sep +
+                            rep + '.bowtie', '\t'):
+        readid[data[0]] = ''
+for rep in repeat_list:
+    for data in import_text(sorted_bowtie + os.path.sep
+                            + rep + '.bowtie', '\t'):
+        readid[data[0]] += str(repeat_key[rep]) + str(',')
+for subfamilies in readid.values():
+    if subfamilies not in counts:
+        counts[subfamilies] = 0
+    counts[subfamilies] += 1
+    sumofrepeatreads += 1
+del readid
+print('Identified ' + str(sumofrepeatreads) +
+      ' reads that mapped to repeats for unique and multimappers.')
+
+##############################################################################
+print("Conducting final calculations...")
+
+
+def convert(x):
+    '''
+    build a converter to numeric label for repeat and yield a combined list
+    of repnames seperated by backslash
+    '''
+    x = x.strip(',')
+    x = x.split(',')
+    global repname
+    repname = ""
+    for i in x:
+        repname = repname + os.path.sep + rev_repeat_key[int(i)]
+
+
+# building the total counts for repeat element enrichment...
+for x in counts.keys():
+    count = counts[x]
+    x = x.strip(',')
+    x = x.split(',')
+    for i in x:
+        reptotalcounts[rev_repeat_key[int(i)]] += int(count)
+# building the fractional counts for repeat element enrichment...
+for x in counts.keys():
+    count = counts[x]
+    x = x.strip(',')
+    x = x.split(',')
+    splits = len(x)
+    for i in x:
+        fractionalcounts[rev_repeat_key[int(i)]] += float(
+            numpy.divide(float(count), float(splits)))
+# building categorized table of repeat element enrichment...
+repcounts = {}
+repcounts['other'] = 0
+for key in counts.keys():
+        convert(key)
+        repcounts[repname] = counts[key]
+# building the total counts for class enrichment...
+for key in reptotalcounts.keys():
+    classtotalcounts[repeatclass[key]] += reptotalcounts[key]
+# building total counts for family enrichment...
+for key in reptotalcounts.keys():
+    familytotalcounts[repeatfamily[key]] += reptotalcounts[key]
+# building unique counts table'
+repcounts2 = {}
+for rep in repeat_list:
+    if "/" + rep in repcounts:
+        repcounts2[rep] = repcounts["/" + rep]
+    else:
+        repcounts2[rep] = 0
+# building the fractionalcounts counts for class enrichment...
+for key in fractionalcounts.keys():
+    classfractionalcounts[repeatclass[key]] += fractionalcounts[key]
+# building fractional counts for family enrichment...
+for key in fractionalcounts.keys():
+    familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key]
+
+##############################################################################
+print('Writing final output and removing intermediate files...')
+# print output to file of the categorized counts and total overlapping counts:
+if allcountmethod == "TRUE":
+    fout1 = open(outputfolder + os.path.sep + outputfile_prefix
+                 + '_total_counts.txt', 'w')
+    for key in reptotalcounts.keys():
+        fout1.write(str(key) + '\t' + repeatclass[key] + '\t' +
+                    repeatfamily[key] + '\t' + str(reptotalcounts[key])
+                    + '\n')
+    fout2 = open(outputfolder + os.path.sep + outputfile_prefix
+                 + '_class_total_counts.txt', 'w')
+    for key in classtotalcounts.keys():
+        fout2.write(str(key) + '\t' + str(classtotalcounts[key]) + '\n')
+    fout3 = open(outputfolder + os.path.sep + outputfile_prefix
+                 + '_family_total_counts.txt', 'w')
+    for key in familytotalcounts.keys():
+        fout3.write(str(key) + '\t' + str(familytotalcounts[key]) + '\n')
+    fout4 = open(outputfolder + os.path.sep + outputfile_prefix +
+                 '_unique_counts.txt', 'w')
+    for key in repcounts2.keys():
+        fout4.write(str(key) + '\t' + repeatclass[key] + '\t' +
+                    repeatfamily[key] + '\t' + str(repcounts2[key]) + '\n')
+        fout5 = open(outputfolder + os.path.sep + outputfile_prefix
+                     + '_class_fraction_counts.txt', 'w')
+    for key in classfractionalcounts.keys():
+        fout5.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n')
+    fout6 = open(outputfolder + os.path.sep + outputfile_prefix +
+                 '_family_fraction_counts.txt', 'w')
+    for key in familyfractionalcounts.keys():
+        fout6.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n')
+    fout7 = open(outputfolder + os.path.sep + outputfile_prefix
+                 + '_fraction_counts.txt', 'w')
+    for key in fractionalcounts.keys():
+        fout7.write(str(key) + '\t' + repeatclass[key] + '\t' +
+                    repeatfamily[key] + '\t' + str(int(fractionalcounts[key]))
+                    + '\n')
+        fout1.close()
+    fout2.close()
+    fout3.close()
+    fout4.close()
+    fout5.close()
+    fout6.close()
+    fout7.close()
+else:
+    fout1 = open(outputfolder + os.path.sep + outputfile_prefix +
+                 '_class_fraction_counts.txt', 'w')
+    for key in classfractionalcounts.keys():
+        fout1.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n')
+    fout2 = open(outputfolder + os.path.sep + outputfile_prefix +
+                 '_family_fraction_counts.txt', 'w')
+    for key in familyfractionalcounts.keys():
+        fout2.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n')
+    fout3 = open(outputfolder + os.path.sep + outputfile_prefix +
+                 '_fraction_counts.txt', 'w')
+    for key in fractionalcounts.keys():
+        fout3.write(str(key) + '\t' + repeatclass[key] + '\t' +
+                    repeatfamily[key] + '\t' + str(int(fractionalcounts[key]))
+                    + '\n')
+    fout1.close()
+    fout2.close()
+    fout3.close()
+##############################################################################
+#  Remove Large intermediate files
+if os.path.exists(outputfolder + os.path.sep + outputfile_prefix +
+                  '_regionsorter.txt'):
+    os.remove(outputfolder + os.path.sep + outputfile_prefix +
+              '_regionsorter.txt')
+if os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
+    shutil.rmtree(outputfolder + os.path.sep + 'pair1_bowtie')
+if os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'):
+    shutil.rmtree(outputfolder + os.path.sep + 'pair2_bowtie')
+if os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
+    shutil.rmtree(outputfolder + os.path.sep + 'sorted_bowtie')
+print("... Done")