view RepEnrich.py @ 10:6f4143893463 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit e3881f05134c6f50889d0376d27e1c232251f8b3
author artbio
date Tue, 05 Feb 2019 17:20:55 -0500
parents d1f7ab78f7b5
children 89e05f831259
line wrap: on
line source

import argparse
import csv
import os
import shlex
import shutil
import subprocess
import sys

import numpy


parser = argparse.ArgumentParser(description='''
             Part II: Conducting the alignments to the psuedogenomes. Before\
             doing this step you will require 1) a bamfile of the unique\
             alignments with index 2) a fastq file of the reads mapping to\
             more than one location. These files can be obtained using the\
             following bowtie options [EXAMPLE: bowtie -S -m 1\
             --max multimap.fastq mm9 mate1_reads.fastq]  Once you have the\
             unique alignment bamfile and the reads mapping to more than one\
             location in a fastq file you can run this step.  EXAMPLE: python\
             master_output.py\
             /users/nneretti/data/annotation/hg19/hg19_repeatmasker.txt\
             /users/nneretti/datasets/repeatmapping/POL3/Pol3_human/
             HeLa_InputChIPseq_Rep1 HeLa_InputChIPseq_Rep1\
             /users/nneretti/data/annotation/hg19/setup_folder\
             HeLa_InputChIPseq_Rep1_multimap.fastq\
             HeLa_InputChIPseq_Rep1.bam''')
parser.add_argument('--version', action='version', version='%(prog)s 0.1')
parser.add_argument('annotation_file', action='store',
                    metavar='annotation_file',
                    help='List RepeatMasker.org annotation file for your\
                          organism.  The file may be downloaded from the\
                          RepeatMasker.org website. Example:\
                          /data/annotation/hg19/hg19_repeatmasker.txt')
parser.add_argument('outputfolder', action='store', metavar='outputfolder',
                    help='List folder to contain results.\
                          Example: /outputfolder')
parser.add_argument('outputprefix', action='store', metavar='outputprefix',
                    help='Enter prefix name for data.\
                           Example: HeLa_InputChIPseq_Rep1')
parser.add_argument('setup_folder', action='store', metavar='setup_folder',
                    help='List folder that contains the repeat element\
                          pseudogenomes.\
                          Example: /data/annotation/hg19/setup_folder')
parser.add_argument('fastqfile', action='store', metavar='fastqfile',
                    help='Enter file for the fastq reads that map to multiple\
                          locations. Example: /data/multimap.fastq')
parser.add_argument('alignment_bam', action='store', metavar='alignment_bam',
                    help='Enter bamfile output for reads that map uniquely.\
                    Example /bamfiles/old.bam')
parser.add_argument('--pairedend', action='store', dest='pairedend',
                    default='FALSE',
                    help='Designate this option for paired-end sequencing.\
                          Default FALSE change to TRUE')
parser.add_argument('--collapserepeat', action='store', dest='collapserepeat',
                    metavar='collapserepeat', default='Simple_repeat',
                    help='Designate this option to generate a collapsed repeat\
                          type. Uncollapsed output is generated in addition to\
                          collapsed repeat type. Simple_repeat is default to\
                          simplify downstream analysis. You can change the\
                          default to another repeat name to collapse a\
                          seperate specific repeat instead or if the name of\
                          Simple_repeat is different for your organism.\
                          Default Simple_repeat')
parser.add_argument('--fastqfile2', action='store', dest='fastqfile2',
                    metavar='fastqfile2', default='none',
                    help='Enter fastqfile2 when using paired-end option.\
                          Default none')
parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
                    default="1", type=int,
                    help='Enter available cpus per node.  The more cpus the\
                          faster RepEnrich performs. RepEnrich is designed to\
                          only work on one node. Default: "1"')
parser.add_argument('--allcountmethod', action='store', dest='allcountmethod',
                    metavar='allcountmethod', default="FALSE",
                    help='By default the pipeline only outputs the fraction\
                          count method.  Consdidered to be the best way to\
                          count multimapped reads. Changing this option will\
                          include the unique count method, a conservative\
                          count, and the total count method, a liberal\
                          counting strategy. Our evaluation of simulated data\
                          indicated fraction counting is best.\
                          Default = FALSE, change to TRUE')
parser.add_argument('--is_bed', action='store', dest='is_bed',
                    metavar='is_bed', default='FALSE',
                    help='Is the annotation file a bed file.\
                         This is also a compatible format. The file needs to\
                         be a tab seperated bed with optional fields.\
                         Ex. format: chr\tstart\tend\tName_element\tclass\
                         \tfamily. The class and family should identical to\
                         name_element if not applicable.\
                         Default FALSE change to TRUE')
args = parser.parse_args()

# parameters
annotation_file = args.annotation_file
outputfolder = args.outputfolder
outputfile_prefix = args.outputprefix
setup_folder = args.setup_folder
repeat_bed = setup_folder + os.path.sep + 'repnames.bed'
unique_mapper_bam = args.alignment_bam
fastqfile_1 = args.fastqfile
fastqfile_2 = args.fastqfile2
cpus = args.cpus
b_opt = "-k1 -p " + str(1) + " --quiet"
simple_repeat = args.collapserepeat
paired_end = args.pairedend
allcountmethod = args.allcountmethod
is_bed = args.is_bed

##############################################################################
# check that the programs we need are available
try:
    subprocess.call(shlex.split("coverageBed -h"),
                    stdout=open(os.devnull, 'wb'),
                    stderr=open(os.devnull, 'wb'))
    subprocess.call(shlex.split("bowtie --version"),
                    stdout=open(os.devnull, 'wb'),
                    stderr=open(os.devnull, 'wb'))
except OSError:
    print("Error: Bowtie or BEDTools not loaded")
    raise

##############################################################################
# define a csv reader that reads space deliminated files
print('Preparing for analysis using RepEnrich...')
csv.field_size_limit(sys.maxsize)


def import_text(filename, separator):
    for line in csv.reader(open(filename), delimiter=separator,
                           skipinitialspace=True):
        if line:
            yield line


##############################################################################
# build dictionaries to convert repclass and rep families'
if is_bed == "FALSE":
    repeatclass = {}
    repeatfamily = {}
    fin = import_text(annotation_file, ' ')
    x = 0
    for line in fin:
        if x > 2:
            classfamily = []
            classfamily = line[10].split(os.path.sep)
            line9 = line[9].replace("(", "_").replace(
                                    ")", "_").replace("/", "_")
            repeatclass[line9] = classfamily[0]
            if len(classfamily) == 2:
                repeatfamily[line9] = classfamily[1]
            else:
                repeatfamily[line9] = classfamily[0]
        x += 1
if is_bed == "TRUE":
    repeatclass = {}
    repeatfamily = {}
    fin = open(annotation_file, 'r')
    for line in fin:
        line = line.strip('\n')
        line = line.split('\t')
        theclass = line[4]
        thefamily = line[5]
        line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_")
        repeatclass[line3] = theclass
        repeatfamily[line3] = thefamily
fin.close()

##############################################################################
# build list of repeats initializing dictionaries for downstream analysis'
fin = import_text(setup_folder + os.path.sep + 'repgenomes_key.txt', '\t')
repeat_key = {}
rev_repeat_key = {}
repeat_list = []
reptotalcounts = {}
classfractionalcounts = {}
familyfractionalcounts = {}
classtotalcounts = {}
familytotalcounts = {}
reptotalcounts_simple = {}
fractionalcounts = {}
i = 0
for line in fin:
    reptotalcounts[line[0]] = 0
    fractionalcounts[line[0]] = 0
    if line[0] in repeatclass:
        classtotalcounts[repeatclass[line[0]]] = 0
        classfractionalcounts[repeatclass[line[0]]] = 0
    if line[0] in repeatfamily:
        familytotalcounts[repeatfamily[line[0]]] = 0
        familyfractionalcounts[repeatfamily[line[0]]] = 0
    if line[0] in repeatfamily:
        if repeatfamily[line[0]] == simple_repeat:
            reptotalcounts_simple[simple_repeat] = 0
    else:
        reptotalcounts_simple[line[0]] = 0
    repeat_list.append(line[0])
    repeat_key[line[0]] = int(line[1])
    rev_repeat_key[int(line[1])] = line[0]
fin.close()
##############################################################################
# map the repeats to the psuedogenomes:
if not os.path.exists(outputfolder):
    os.mkdir(outputfolder)
##############################################################################
# Conduct the regions sorting
print('Conducting region sorting on unique mapping reads....')
fileout = outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt'
with open(fileout, 'w') as stdout:
    command = shlex.split("coverageBed -abam " + unique_mapper_bam + " -b " +
                          setup_folder + os.path.sep + 'repnames.bed')
    p = subprocess.Popen(command, stdout=stdout)
p.communicate()
stdout.close()
filein = open(outputfolder + os.path.sep + outputfile_prefix
              + '_regionsorter.txt', 'r')
counts = {}
sumofrepeatreads = 0
for line in filein:
    line = line.split('\t')
    if not str(repeat_key[line[3]]) in counts:
        counts[str(repeat_key[line[3]])] = 0
    counts[str(repeat_key[line[3]])] += int(line[4])
    sumofrepeatreads += int(line[4])
print('Identified ' + str(sumofrepeatreads) +
      'unique reads that mapped to repeats.')
##############################################################################
if paired_end == 'TRUE':
    if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
        os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie')
    if not os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'):
        os.mkdir(outputfolder + os.path.sep + 'pair2_bowtie')
    folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie'
    folder_pair2 = outputfolder + os.path.sep + 'pair2_bowtie'
##############################################################################
    print("Processing repeat psuedogenomes...")
    ps = []
    psb = []
    ticker = 0
    for metagenome in repeat_list:
        metagenomepath = setup_folder + os.path.sep + metagenome
        file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
        file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie'
        with open(file1, 'w') as stdout:
            command = shlex.split("bowtie " + b_opt + " " +
                                  metagenomepath + " " + fastqfile_1)
            p = subprocess.Popen(command, stdout=stdout)
        with open(file2, 'w') as stdout:
            command = shlex.split("bowtie " + b_opt + " " +
                                  metagenomepath + " " + fastqfile_2)
            pp = subprocess.Popen(command, stdout=stdout)
        ps.append(p)
        ticker += 1
        psb.append(pp)
        ticker += 1
        if ticker == cpus:
            for p in ps:
                p.communicate()
            for p in psb:
                p.communicate()
            ticker = 0
            psb = []
            ps = []
    if len(ps) > 0:
        for p in ps:
            p.communicate()
    stdout.close()

##############################################################################
# combine the output from both read pairs:
    print('sorting and combining the output for both read pairs...')
    if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
        os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie')
    sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
    for metagenome in repeat_list:
        file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
        file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie'
        fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie'
        with open(fileout, 'w') as stdout:
            p1 = subprocess.Popen(['cat', file1, file2],
                                  stdout=subprocess.PIPE)
            p2 = subprocess.Popen(['cut', '-f1', "-d "], stdin=p1.stdout,
                                  stdout=subprocess.PIPE)
            p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
                                  stdout=subprocess.PIPE)
            p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
                                  stdout=subprocess.PIPE)
            p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
            p5.communicate()
        stdout.close()
    print('completed ...')
##############################################################################
if paired_end == 'FALSE':
    if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
        os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie')
    folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie'
##############################################################################
    ps = []
    ticker = 0
    print("Processing repeat psuedogenomes...")
    for metagenome in repeat_list:
        metagenomepath = setup_folder + os.path.sep + metagenome
        file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
        with open(file1, 'w') as stdout:
            command = shlex.split("bowtie " + b_opt + " " +
                                  metagenomepath + " " + fastqfile_1)
            p = subprocess.Popen(command, stdout=stdout)
        ps.append(p)
        ticker += 1
        if ticker == cpus:
            for p in ps:
                p.communicate()
            ticker = 0
            ps = []
    if len(ps) > 0:
        for p in ps:
            p.communicate()
    stdout.close()

##############################################################################
# combine the output from both read pairs:
    print('Sorting and combining the output for both read pairs....')
    if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
        os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie')
    sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
    for metagenome in repeat_list:
        file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
        fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie'
        with open(fileout, 'w') as stdout:
            p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE)
            p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout,
                                  stdout=subprocess.PIPE)
            p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
                                  stdout=subprocess.PIPE)
            p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
                                  stdout=subprocess.PIPE)
            p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
            p5.communicate()
        stdout.close()
    print('completed ...')

##############################################################################
# build a file of repeat keys for all reads
print('Writing and processing intermediate files...')
sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
readid = {}
sumofrepeatreads = 0
for rep in repeat_list:
    for data in import_text(sorted_bowtie + os.path.sep +
                            rep + '.bowtie', '\t'):
        readid[data[0]] = ''
for rep in repeat_list:
    for data in import_text(sorted_bowtie + os.path.sep
                            + rep + '.bowtie', '\t'):
        readid[data[0]] += str(repeat_key[rep]) + str(',')
for subfamilies in readid.values():
    if subfamilies not in counts:
        counts[subfamilies] = 0
    counts[subfamilies] += 1
    sumofrepeatreads += 1
del readid
print('Identified ' + str(sumofrepeatreads) +
      ' reads that mapped to repeats for unique and multimappers.')

##############################################################################
print("Conducting final calculations...")


def convert(x):
    '''
    build a converter to numeric label for repeat and yield a combined list
    of repnames seperated by backslash
    '''
    x = x.strip(',')
    x = x.split(',')
    global repname
    repname = ""
    for i in x:
        repname = repname + os.path.sep + rev_repeat_key[int(i)]


# building the total counts for repeat element enrichment...
for x in counts.keys():
    count = counts[x]
    x = x.strip(',')
    x = x.split(',')
    for i in x:
        reptotalcounts[rev_repeat_key[int(i)]] += int(count)
# building the fractional counts for repeat element enrichment...
for x in counts.keys():
    count = counts[x]
    x = x.strip(',')
    x = x.split(',')
    splits = len(x)
    for i in x:
        fractionalcounts[rev_repeat_key[int(i)]] += float(
            numpy.divide(float(count), float(splits)))
# building categorized table of repeat element enrichment...
repcounts = {}
repcounts['other'] = 0
for key in counts.keys():
    convert(key)
    repcounts[repname] = counts[key]
# building the total counts for class enrichment...
for key in reptotalcounts.keys():
    classtotalcounts[repeatclass[key]] += reptotalcounts[key]
# building total counts for family enrichment...
for key in reptotalcounts.keys():
    familytotalcounts[repeatfamily[key]] += reptotalcounts[key]
# building unique counts table'
repcounts2 = {}
for rep in repeat_list:
    if "/" + rep in repcounts:
        repcounts2[rep] = repcounts["/" + rep]
    else:
        repcounts2[rep] = 0
# building the fractionalcounts counts for class enrichment...
for key in fractionalcounts.keys():
    classfractionalcounts[repeatclass[key]] += fractionalcounts[key]
# building fractional counts for family enrichment...
for key in fractionalcounts.keys():
    familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key]

##############################################################################
print('Writing final output and removing intermediate files...')
# print output to file of the categorized counts and total overlapping counts:
if allcountmethod == "TRUE":
    fout1 = open(outputfolder + os.path.sep + outputfile_prefix
                 + '_total_counts.txt', 'w')
    for key in sorted(reptotalcounts.keys()):
        fout1.write(str(key) + '\t' + repeatclass[key] + '\t' +
                    repeatfamily[key] + '\t' + str(reptotalcounts[key])
                    + '\n')
    fout2 = open(outputfolder + os.path.sep + outputfile_prefix
                 + '_class_total_counts.txt', 'w')
    for key in sorted(classtotalcounts.keys()):
        fout2.write(str(key) + '\t' + str(classtotalcounts[key]) + '\n')
    fout3 = open(outputfolder + os.path.sep + outputfile_prefix
                 + '_family_total_counts.txt', 'w')
    for key in sorted(familytotalcounts.keys()):
        fout3.write(str(key) + '\t' + str(familytotalcounts[key]) + '\n')
    fout4 = open(outputfolder + os.path.sep + outputfile_prefix +
                 '_unique_counts.txt', 'w')
    for key in sorted(repcounts2.keys()):
        fout4.write(str(key) + '\t' + repeatclass[key] + '\t' +
                    repeatfamily[key] + '\t' + str(repcounts2[key]) + '\n')
        fout5 = open(outputfolder + os.path.sep + outputfile_prefix
                     + '_class_fraction_counts.txt', 'w')
    for key in sorted(classfractionalcounts.keys()):
        fout5.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n')
    fout6 = open(outputfolder + os.path.sep + outputfile_prefix +
                 '_family_fraction_counts.txt', 'w')
    for key in sorted(familyfractionalcounts.keys()):
        fout6.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n')
    fout7 = open(outputfolder + os.path.sep + outputfile_prefix
                 + '_fraction_counts.txt', 'w')
    for key in sorted(fractionalcounts.keys()):
        fout7.write(str(key) + '\t' + repeatclass[key] + '\t' +
                    repeatfamily[key] + '\t' + str(int(fractionalcounts[key]))
                    + '\n')
        fout1.close()
    fout2.close()
    fout3.close()
    fout4.close()
    fout5.close()
    fout6.close()
    fout7.close()
else:
    fout1 = open(outputfolder + os.path.sep + outputfile_prefix +
                 '_class_fraction_counts.txt', 'w')
    for key in sorted(classfractionalcounts.keys()):
        fout1.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n')
    fout2 = open(outputfolder + os.path.sep + outputfile_prefix +
                 '_family_fraction_counts.txt', 'w')
    for key in sorted(familyfractionalcounts.keys()):
        fout2.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n')
    fout3 = open(outputfolder + os.path.sep + outputfile_prefix +
                 '_fraction_counts.txt', 'w')
    for key in sorted(fractionalcounts.keys()):
        fout3.write(str(key) + '\t' + repeatclass[key] + '\t' +
                    repeatfamily[key] + '\t' + str(int(fractionalcounts[key]))
                    + '\n')
    fout1.close()
    fout2.close()
    fout3.close()
##############################################################################
#  Remove Large intermediate files
if os.path.exists(outputfolder + os.path.sep + outputfile_prefix +
                  '_regionsorter.txt'):
    os.remove(outputfolder + os.path.sep + outputfile_prefix +
              '_regionsorter.txt')
if os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
    shutil.rmtree(outputfolder + os.path.sep + 'pair1_bowtie')
if os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'):
    shutil.rmtree(outputfolder + os.path.sep + 'pair2_bowtie')
if os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
    shutil.rmtree(outputfolder + os.path.sep + 'sorted_bowtie')
print("... Done")