diff ChipSeqRatioAnalysis.py @ 3:e320ef2d105a draft

Uploaded
author petr-novak
date Thu, 05 Sep 2019 09:04:56 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ChipSeqRatioAnalysis.py	Thu Sep 05 09:04:56 2019 -0400
@@ -0,0 +1,202 @@
+#!/usr/bin/env python3
+import re
+import argparse
+import csv
+import os
+import sys
+import os.path
+import subprocess
+import shlex
+import multiprocessing as mp
+import tempfile
+import itertools as it
+
+
+def get_arguments():
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-m',
+                        '--max_cl',
+                        default=300,
+                        type=int,
+                        help='Sets the maximum cluster number. Default = 300')
+    parser.add_argument('-b',
+                        '--bitscore',
+                        default=30,
+                        type=int,
+                        help='minimal bitscore to report')
+    parser.add_argument(
+        '-n',
+        '--nproc',
+        default=mp.cpu_count(),
+        type=int,
+        help='Sets the number of cpus to be used. Default = all available')
+    parser.add_argument('-c',
+                        '--ChipSeq',
+                        required=True,
+                        help='Fasta file containing the Chip Sequences')
+    parser.add_argument('-i',
+                        '--InputSeq',
+                        required=True,
+                        help='Fasta file containing the Input Sequences')
+    parser.add_argument(
+        '-o',
+        '--output',
+        required=False,
+        default='ChipSeqRatio.csv',
+        help=('Specify a name for the CSV file to which the'
+              ' output will be save to. Default: ChipSeqRatio.csv'))
+    parser.add_argument(
+        '-ht',
+        '--html',
+        required=False,
+        default='ChipSeqRatioReport',
+        help='Specify a name for the html report. Default : ChipSeqRatioReport')
+    parser.add_argument('-k',
+                        '--Contigs',
+                        required=True,
+                        help='Contig file for blast')
+    args = parser.parse_args()
+    return args
+
+
+def split(filename, num_chunks):
+    # splits a files into nproc files
+    files = []
+    temp_files = []
+
+    # creating a list with nproc temporary files
+    for f in range(num_chunks):
+        temp_files.append(tempfile.NamedTemporaryFile('w', delete=False))
+    with open(filename, 'r') as f:
+        end = os.path.getsize(filename)
+        for temp in it.cycle(temp_files):
+            # cycling indefenitly through temp files
+            ID = f.readline()  # get sequence id
+            if ID[0] == '>':
+                temp.write(ID)
+                SEQP = f.tell()  # get file pointer location
+                while f.readline()[0] is not '>':
+                    f.seek(SEQP)  # jump to last saved file pointer
+                    temp.write(f.readline())  # write sequen
+                    SEQP = f.tell()  # overwrite last file pointer location
+                    # break loop if file pointer reached the EOF
+                    if (SEQP == end):
+                        break
+            if (SEQP == end):  # break loop if file pointer reached the EOF
+                break
+            f.seek(SEQP)
+
+    for f in range(num_chunks):
+        # save temp files names into list for further use
+        files.append(temp_files[f].name)
+        temp_files[f].close()  # close temp files
+    return files
+
+
+def blast(query, database, bitscore):
+    # blast a file for given arguments and save result to a list
+    print(query)
+    arguments = (
+        "blastn -task blastn -db {} -query {} "
+        "-evalue 1e-2 -gapopen 5 -gapextend 2 -word_size 11 -num_alignments 1"
+        " -penalty -3 -reward 2 -outfmt 6 -dust no").format(database, query)
+    cmd = shlex.split(arguments)
+    Blast_Output = [0 for x in range(max_cl + 1)]
+    ma = re.compile('(\S+)\tCL(\d+)Contig')  # expression to check for
+    with subprocess.Popen(cmd,
+                          stdout=subprocess.PIPE,
+                          universal_newlines=True) as p:
+        for line in p.stdout:
+            if float(line.split()[11]) > bitscore:
+                gr = ma.match(line)
+                previous_query = ''
+                if gr:
+                    if (gr.group(1) != previous_query):
+                        if (int(gr.group(2)) > max_cl):
+                            Blast_Output[0] = Blast_Output[0] + 1
+                        else:
+                            Blast_Output[int(gr.group(2))] = Blast_Output[
+                                int(gr.group(2))] + 1
+                        previous_query = gr.group(1)
+    return Blast_Output
+
+
+def ReduceLists(x, y):
+    ''' reduces two lists into a 2-dim matrix '''
+    Matrix = [[0 for i in range(max_cl + 1)] for i in range(2)]
+    for i in range(len(x)):
+        for j in range(len(x[i])):
+            Matrix[0][j] = Matrix[0][j] + x[i][j]
+    for i in range(len(y)):
+        for j in range(len(y[i])):
+            Matrix[1][j] = Matrix[1][j] + y[i][j]
+    return Matrix
+
+
+def fasta_size(fastafile):
+    with open(fastafile, 'r') as f:
+        s = 0
+        for i in f:
+            if i[0] == ">":
+                s += 1
+    return s
+
+
+def makeblastdb(filename):
+    dbtmp = tempfile.NamedTemporaryFile()
+    cmd = [
+        'makeblastdb', '-in', filename, '-input_type', 'fasta', '-dbtype',
+        'nucl', '-out', dbtmp.name
+    ]
+    subprocess.call(cmd)
+    return dbtmp
+
+
+if __name__ == "__main__":
+    args = get_arguments()
+    max_cl = args.max_cl
+    output = args.output
+    HTMLreport = args.html
+    contigs = args.Contigs
+
+    # Creation of database
+    db = makeblastdb(contigs)
+
+    inputN = fasta_size(args.InputSeq)
+    chipN = fasta_size(args.ChipSeq)
+
+    # Reading and distribution of data to temp files for multiprocessing
+    filesC = split(args.ChipSeq, args.nproc)
+    filesI = split(args.InputSeq, args.nproc)
+
+    # start of parallized blast
+    pool = mp.Pool(processes=args.nproc)
+    results = [pool.apply_async(blast, args=(f, db.name, args.bitscore)) for f in filesC]
+    Cout = [p.get() for p in results]
+    results = [pool.apply_async(blast, args=(f, db.name, args.bitscore)) for f in filesI]
+    Iout = [p.get() for p in results]
+
+    # Merging of blast output into a 2-dim matrix
+    Matrix = ReduceLists(Cout, Iout)
+
+    with open(args.output, 'w') as f:
+        print("Cluster", "Chip_Hits", "Input_Hits", sep='\t', file=f)
+        for hit in range(1, args.max_cl + 1):
+            print(hit, Matrix[0][hit], Matrix[1][hit], sep='\t', file=f)
+    Rarguments = "Rscript " + \
+        os.path.dirname(__file__) + "/ChipSeqRatioAnalysis.R"
+    # order is important - programmed by georg - this it realy ugly!
+    args = shlex.split(Rarguments)
+    args.append(output)
+    args.append(HTMLreport)
+    args.append(str(inputN))
+    args.append(str(chipN))
+    with subprocess.Popen(args, stderr=subprocess.PIPE) as p:
+        print("Creating HTML report")
+        stdout, stderr = p.communicate()
+        if (len(stderr) > 0):
+            print(stderr)
+    # cleanup
+    for i in filesC + filesI:
+        os.unlink(i)