profrep.py @ 0:a5f1638b73be (draft), Mercurial repository petr-novak/profrep
author: petr-novak
date:   Wed, 26 Jun 2019 08:01:42 -0400

#!/usr/bin/env python3

import subprocess
import csv
import time
import sys
import matplotlib
# the non-interactive backend must be selected before pyplot is imported
# matplotlib.use("PDF")
matplotlib.use("pdf")
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx
import multiprocessing
import argparse
import os
from functools import partial
from multiprocessing import Pool
from tempfile import NamedTemporaryFile
from operator import itemgetter
from itertools import groupby
import gff
import configuration
import visualization
import distutils
from distutils import dir_util
import tempfile
import re
from Bio import SeqIO
import pickle
import shutil
import warnings
import random
import numpy as np
import dante_gff_output_filtering as domains_filtering
import dante as protein_domains

t_profrep = time.time()
np.set_printoptions(threshold=sys.maxsize)
warnings.filterwarnings("ignore", module="matplotlib")


class Range():
    '''
    This class is used to check float range in argparse
    '''

    def __init__(self, start, end):
        self.start = start
        self.end = end

    def __eq__(self, other):
        return self.start <= other <= self.end

    def __str__(self):
        return "float range {}..{}".format(self.start, self.end)

    def __repr__(self):
        return "float range {}..{}".format(self.start, self.end)


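# Illustrative sketch, not part of the original pipeline: Range works as an
# argparse `choices` entry because argparse tests membership with `==`, and
# Range.__eq__ accepts any value inside the interval. The parser and option
# names below are hypothetical.
def _range_example():
    demo = argparse.ArgumentParser()
    demo.add_argument("--fraction", type=float, choices=[Range(0.0, 1.0)])
    assert demo.parse_args(["--fraction", "0.35"]).fraction == 0.35
    # a value outside 0.0..1.0 would make argparse exit with an error

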
def get_version(path):
    branch = subprocess.check_output("git rev-parse --abbrev-ref HEAD",
                                     shell=True,
                                     cwd=path).decode('ascii').strip()
    shorthash = subprocess.check_output("git log --pretty=format:'%h' -n 1 ",
                                        shell=True,
                                        cwd=path).decode('ascii').strip()
    # count commits, one per line of `git log --oneline`
    revcount = len(subprocess.check_output(
        "git log --oneline", shell=True,
        cwd=path).decode('ascii').splitlines())
    version_string = ("-------------------------------------"
                      "-------------------------------------\n"
                      "PIPELINE VERSION : "
                      "{branch}-rv-{revcount}({shorthash})\n"
                      "-------------------------------------"
                      "-------------------------------------\n").format(
                          branch=branch,
                          shorthash=shorthash,
                          revcount=revcount)
    return version_string


def str2bool(v):
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected')


def check_fasta_id(QUERY):
    ''' Collect sequence IDs containing forbidden characters and all headers '''
    forbidden_ids = []
    headers = []
    for record in SeqIO.parse(QUERY, "fasta"):
        if any(x in record.id for x in configuration.FORBIDDEN_CHARS):
            forbidden_ids.append(record.id)
        headers.append(record.id)
    if len(headers) > len(set([header.split(" ")[0] for header in headers])):
        raise NameError(
            '''Sequences in multifasta format are not named correctly:
            seq IDs(before the first space) are the same''')
    return forbidden_ids, headers


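# Illustrative sketch, not part of the original pipeline: how the ID check is
# meant to be used before any processing starts. The file name is hypothetical;
# configuration.FORBIDDEN_CHARS is assumed to list characters such as '/' and
# '\\' that would break downstream file naming (see the error raised in main()).
def _check_fasta_id_example():
    bad_ids, all_headers = check_fasta_id("example_genome.fasta")
    if bad_ids:
        print("IDs to rename:", ", ".join(bad_ids))
    print("{} sequences found".format(len(all_headers)))

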
def multifasta(QUERY):
    ''' Create single fasta temporary files to be processed sequentially '''
    PATTERN = ">"
    fasta_list = []
    with open(QUERY, "r") as fasta:
        reader = fasta.read()
        splitter = reader.split(PATTERN)[1:]
        for fasta_num, part in enumerate(splitter):
            ntf = NamedTemporaryFile(delete=False)
            ntf.write("{}{}".format(PATTERN, part).encode("utf-8"))
            fasta_list.append(ntf.name)
            ntf.close()
    return fasta_list


def fasta_read(subfasta):
    ''' Read fasta, gain header and sequence without gaps '''
    sequence_lines = []
    with open(subfasta, "r") as fasta:
        header = fasta.readline().strip().split(" ")[0][1:]
        for line in fasta:
            clean_line = line.strip()
            if clean_line:
                sequence_lines.append(clean_line)
    sequence = "".join(sequence_lines)
    return header, sequence


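# Illustrative sketch, not part of the original pipeline: multifasta() writes
# one temporary file per ">" record and returns their paths, so the caller is
# responsible for deleting them (main() does this with os.unlink at the end).
# The input file name is hypothetical.
def _multifasta_example():
    parts = multifasta("example_genome.fasta")
    try:
        for path in parts:
            header, sequence = fasta_read(path)
            print(header, len(sequence))
    finally:
        for path in parts:
            os.unlink(path)

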
def cluster_annotation(CL_ANNOTATION_TBL):
    ''' Create dictionary of known annotations classes and related clusters '''
    cl_annotations = {}
    annot_table = np.genfromtxt(CL_ANNOTATION_TBL, dtype=str)
    for line in annot_table:
        if line[1] in cl_annotations:
            cl_annotations[line[1]].append(line[0])
        else:
            cl_annotations[line[1]] = [line[0]]
    return list(cl_annotations.items()), list(cl_annotations.keys())


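# Illustrative sketch, not part of the original pipeline: the parsing above
# implies a whitespace-separated table with the cluster number in the first
# column and its classification in the second (matching the --ann_tbl help
# text). The rows and classification names below are hypothetical.
def _cluster_annotation_example():
    demo_rows = "1\tLTR/Ty1_copia\n2\tLTR/Ty3_gypsy\n3\tLTR/Ty1_copia\n"
    with tempfile.NamedTemporaryFile("w", suffix=".tbl", delete=False) as tmp:
        tmp.write(demo_rows)
    items, keys = cluster_annotation(tmp.name)
    # items -> [('LTR/Ty1_copia', ['1', '3']), ('LTR/Ty3_gypsy', ['2'])]
    # keys  -> ['LTR/Ty1_copia', 'LTR/Ty3_gypsy']
    os.unlink(tmp.name)
    return items, keys

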
def read_annotation(CLS, cl_annotations_items):
    ''' Dictionary of known repeat classes and related reads '''
    reads_annotations = {}
    with open(CLS, "r") as cls_file:
        count = 0
        for line in cls_file:
            line = line.rstrip()
            count += 1
            if count % 2 == 0:
                reads = re.split(r"\s+", line)
                for element in reads:
                    for key, value in cl_annotations_items:
                        if clust in value:
                            reads_annotations[element] = key
            else:
                clust = re.split(r"\s+", line)[0].split(">CL")[1]
    return reads_annotations


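# Illustrative sketch, not part of the original pipeline: the loop above reads
# the .cls file as alternating lines, a ">CL<number>" header on odd lines and
# the whitespace-separated read IDs of that cluster on even lines. The read
# names and cluster annotations below are hypothetical.
def _read_annotation_example():
    demo_cls = ">CL1 123\nread_0001 read_0002\n>CL2 45\nread_0003\n"
    with tempfile.NamedTemporaryFile("w", suffix=".cls", delete=False) as tmp:
        tmp.write(demo_cls)
    items = [("LTR/Ty1_copia", ["1"]), ("LTR/Ty3_gypsy", ["2"])]
    mapping = read_annotation(tmp.name, items)
    # mapping -> {'read_0001': 'LTR/Ty1_copia', 'read_0002': 'LTR/Ty1_copia',
    #             'read_0003': 'LTR/Ty3_gypsy'}
    os.unlink(tmp.name)
    return mapping

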
def annot_profile(annotation_keys, part):
    ''' Predefine dictionary of known annotations and partial sequence
    repetitive profiles defined by parallel process '''
    subprofile = {}
    for key in annotation_keys:
        subprofile[key] = [np.zeros(part, dtype=int),
                           np.zeros(part, dtype=int)]
    subprofile["ALL"] = [np.zeros(part, dtype=int), np.zeros(part, dtype=int)]
    return subprofile


def parallel_process(WINDOW, OVERLAP, seq_length, annotation_keys,
                     reads_annotations, subfasta, BLAST_DB, E_VALUE, WORD_SIZE,
                     BLAST_TASK, MAX_ALIGNMENTS, BITSCORE, DUST_FILTER,
                     last_index, subsets_num, subset_index):
    ''' Run parallel function to process the input sequence in windows
    Run blast for subsequence defined by the input index and window size
    Create and increment subprofile vector based on reads aligned within window '''
    loc_start = subset_index + 1
    loc_end = subset_index + WINDOW
    if loc_end >= seq_length:
        loc_end = seq_length
        subprofile = annot_profile(annotation_keys, seq_length - loc_start + 1)
    else:
        subprofile = annot_profile(annotation_keys, WINDOW + 1)

    # Find HSP records for every window defined by query location and parse the tabular stdout:
    # 1. query, 2. database read, 3. alignment start, 4. alignment end, 5. bitscore, 6. percent identity
    p = subprocess.Popen(
        "blastn -query {} -query_loc {}-{} -db {} -evalue {} -word_size {} -dust {} -task {} -num_alignments {} -outfmt '6 qseqid sseqid qstart qend bitscore pident'".format(
            subfasta, loc_start, loc_end, BLAST_DB, E_VALUE, WORD_SIZE,
            DUST_FILTER, BLAST_TASK, MAX_ALIGNMENTS),
        stdout=subprocess.PIPE,
        shell=True)
    count_hits = 0
    for line in p.stdout:
        column = line.decode("utf-8").rstrip().split("\t")
        if float(column[4]) >= BITSCORE:
            count_hits += 1
            read = column[1]  # ID of individual aligned read
            if "reduce" in read:
                reads_representation = int(read.split("reduce")[-1])
            else:
                reads_representation = 1
            qstart = int(column[2])  # starting position of alignment
            qend = int(column[3])  # ending position of alignment
            if read in reads_annotations:
                annotation = reads_annotations[read]
            else:
                annotation = "ALL"
            hit_slice = slice(qstart - subset_index - 1, qend - subset_index)
            subprofile[annotation][0][hit_slice] = (
                subprofile[annotation][0][hit_slice] + reads_representation)
            subprofile[annotation][1][hit_slice] = (
                subprofile[annotation][1][hit_slice] +
                float(column[5]) * reads_representation)
    subprofile["ALL"][0] = sum([item[0] for item in subprofile.values()])
    subprofile["ALL"][1] = sum([item[1] for item in subprofile.values()])
    for repeat in subprofile.keys():
        subprofile[repeat][1] = [
            int(round(quality / hits_num)) if hits_num != 0 else quality
            for hits_num, quality in zip(subprofile[repeat][0],
                                         subprofile[repeat][1])
        ]
    if subset_index == 0:
        if subsets_num == 1:
            subprf_name = subprofile_single(subprofile, subset_index)
        else:
            subprf_name = subprofile_first(subprofile, subset_index, WINDOW,
                                           OVERLAP)
    elif subset_index == last_index:
        subprf_name = subprofile_last(subprofile, subset_index, OVERLAP)
    else:
        subprf_name = subprofiles_middle(subprofile, subset_index, WINDOW,
                                         OVERLAP)
    return subprf_name


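# Illustrative sketch, not part of the original pipeline: how the 1-based
# blastn query_loc range of each parallel job is derived from its window start.
# The toy sequence length, window and overlap values are hypothetical; main()
# builds the list of starts from range(0, seq_length, WINDOW - OVERLAP) and may
# drop a redundant last start.
def _window_coordinates_example():
    WINDOW, OVERLAP, seq_length = 20, 6, 50
    starts = list(range(0, seq_length, WINDOW - OVERLAP))   # [0, 14, 28, 42]
    for subset_index in starts:
        loc_start = subset_index + 1
        loc_end = min(subset_index + WINDOW, seq_length)
        print("blastn -query_loc {}-{}".format(loc_start, loc_end))
    # -> 1-20, 15-34, 29-48, 43-50 ; consecutive windows overlap by OVERLAP bp

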
def subprofile_single(subprofile, subset_index):
    subprofile['idx'] = list(range(1, len(subprofile["ALL"][0]) + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name


def subprofile_first(subprofile, subset_index, WINDOW, OVERLAP):
    for key in subprofile.keys():
        subprofile[key][0] = subprofile[key][0][0:-OVERLAP // 2 - 1]
        subprofile[key][1] = subprofile[key][1][0:-OVERLAP // 2 - 1]
    subprofile['idx'] = list(range(subset_index + 1,
                                   subset_index + WINDOW - OVERLAP // 2 + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name


def subprofiles_middle(subprofile, subset_index, WINDOW, OVERLAP):
    for key in subprofile.keys():
        subprofile[key][0] = subprofile[key][0][OVERLAP // 2:-OVERLAP // 2 - 1]
        subprofile[key][1] = subprofile[key][1][OVERLAP // 2:-OVERLAP // 2 - 1]
    subprofile['idx'] = list(range(subset_index + OVERLAP // 2 + 1,
                                   subset_index + WINDOW - OVERLAP // 2 + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name


def subprofile_last(subprofile, subset_index, OVERLAP):
    len_subprofile = len(subprofile['ALL'][0])
    for key in subprofile.keys():
        subprofile[key][0] = subprofile[key][0][OVERLAP // 2:]
        subprofile[key][1] = subprofile[key][1][OVERLAP // 2:]
    subprofile['idx'] = list(range(subset_index + OVERLAP // 2 + 1,
                                   subset_index + len_subprofile + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name


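# Illustrative sketch, not part of the original pipeline: the three trimming
# functions above cut OVERLAP // 2 positions from each edge shared with a
# neighbouring window, so the stitched windows tile the sequence without double
# counting. The toy window, overlap and length values are hypothetical and
# match the _window_coordinates_example() above.
def _window_stitching_example():
    WINDOW, OVERLAP, seq_length = 20, 6, 50
    half = OVERLAP // 2                                         # 3
    # window starts: range(0, 50, WINDOW - OVERLAP) -> 0, 14, 28, 42
    # first window  (start 0):  keeps idx 1 .. WINDOW - half          -> 1..17
    # middle window (start 14): keeps idx 14 + half + 1 .. 14 + WINDOW - half -> 18..31
    # middle window (start 28): keeps idx 32 .. 45
    # last window   (start 42): keeps idx 42 + half + 1 .. seq_length -> 46..50
    pieces = [(1, 17), (18, 31), (32, 45), (46, 50)]
    covered = [pos for lo, hi in pieces for pos in range(lo, hi + 1)]
    assert covered == list(range(1, seq_length + 1))

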
def concatenate_prof(subprofiles_all, files_dict, seq_id, HTML_DATA,
                     wig_files):
    for subprofile in subprofiles_all:
        with open(subprofile, 'rb') as handle:
            individual_dict = pickle.load(handle)
            exclude = set(["idx"])
            for key in set(individual_dict.keys()).difference(exclude):
                if any(individual_dict[key][0]):
                    indices = handle_zero_lines(individual_dict[key][0])
                    if key not in files_dict.keys():
                        prf_name = "{}/{}.wig".format(
                            HTML_DATA, re.sub(r'[\/\|]', '_', key))
                        prf_qual_name = "{}/{}_qual.wig".format(
                            HTML_DATA, re.sub(r'[\/\|]', '_', key))
                        prf_file = open(prf_name, "w")
                        prf_q_file = open(prf_qual_name, "w")
                        prf_file.write("{}{}\n".format(
                            configuration.HEADER_WIG, seq_id))
                        prf_q_file.write("{}{}\n".format(
                            configuration.HEADER_WIG, seq_id))
                        for i in indices:
                            prf_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i],
                                individual_dict[key][0][i]))
                            prf_q_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i],
                                int(individual_dict[key][1][i])))
                        files_dict[key] = [prf_name, [seq_id], prf_qual_name]
                        wig_files.append(prf_file)
                        wig_files.append(prf_q_file)
                        prf_file.close()
                        prf_q_file.close()
                    else:
                        prf_name = files_dict[key][0]
                        prf_qual_name = files_dict[key][2]
                        with open(prf_name, "a") as prf_file, \
                                open(prf_qual_name, "a") as prf_q_file:
                            if seq_id not in files_dict[key][1]:
                                prf_file.write("{}{}\n".format(
                                    configuration.HEADER_WIG, seq_id))
                                prf_q_file.write("{}{}\n".format(
                                    configuration.HEADER_WIG, seq_id))
                                files_dict[key][1].append(seq_id)
                            for i in indices:
                                prf_file.write("{}\t{}\n".format(
                                    individual_dict['idx'][i],
                                    individual_dict[key][0][i]))
                                prf_q_file.write("{}\t{}\n".format(
                                    individual_dict['idx'][i],
                                    int(individual_dict[key][1][i])))
    return files_dict, wig_files


def concatenate_prof_CN(CV, subprofiles_all, files_dict, seq_id, HTML_DATA,
                        wig_files):
    for subprofile in subprofiles_all:
        with open(subprofile, 'rb') as handle:
            individual_dict = pickle.load(handle)
            exclude = set(["idx"])
            for key in set(individual_dict.keys()).difference(exclude):
                if any(individual_dict[key][0]):
                    indices = handle_zero_lines(individual_dict[key][0])
                    if key not in files_dict.keys():
                        prf_name = "{}/{}.wig".format(
                            HTML_DATA, re.sub(r'[\/\|]', '_', key))
                        prf_qual_name = "{}/{}_qual.wig".format(
                            HTML_DATA, re.sub(r'[\/\|]', '_', key))
                        prf_file = open(prf_name, "w")
                        prf_q_file = open(prf_qual_name, "w")
                        prf_file.write("{}{}\n".format(
                            configuration.HEADER_WIG, seq_id))
                        prf_q_file.write("{}{}\n".format(
                            configuration.HEADER_WIG, seq_id))
                        for i in indices:
                            prf_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i],
                                int(individual_dict[key][0][i] / CV)))
                            prf_q_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i],
                                int(individual_dict[key][1][i])))
                        files_dict[key] = [prf_name, [seq_id], prf_qual_name]
                        wig_files.append(prf_file)
                        wig_files.append(prf_q_file)
                        prf_file.close()
                        prf_q_file.close()
                    else:
                        prf_name = files_dict[key][0]
                        prf_qual_name = files_dict[key][2]
                        with open(prf_name, "a") as prf_file, \
                                open(prf_qual_name, "a") as prf_q_file:
                            if seq_id not in files_dict[key][1]:
                                prf_file.write("{}{}\n".format(
                                    configuration.HEADER_WIG, seq_id))
                                prf_q_file.write("{}{}\n".format(
                                    configuration.HEADER_WIG, seq_id))
                                files_dict[key][1].append(seq_id)
                            for i in indices:
                                prf_file.write("{}\t{}\n".format(
                                    individual_dict['idx'][i],
                                    int(individual_dict[key][0][i] / CV)))
                                prf_q_file.write("{}\t{}\n".format(
                                    individual_dict['idx'][i],
                                    int(individual_dict[key][1][i])))
    return files_dict, wig_files


def handle_zero_lines(repeat_subhits):
    ''' Drop positions that contain only zeros, i.e. positions without any hit.
    Border zero positions are preserved so that the graphs are plotted correctly '''
    zero_idx = [idx for idx, val in enumerate(repeat_subhits) if val == 0]
    indices = [idx for idx, val in enumerate(repeat_subhits) if val != 0]
    zero_breakpoints = []
    for key, group in groupby(
            enumerate(zero_idx),
            lambda index_item: index_item[0] - index_item[1]):
        group = list(map(itemgetter(1), group))
        zero_breakpoints.append(group[0])
        zero_breakpoints.append(group[-1])
    if indices:
        indices.extend(zero_breakpoints)
        indices = sorted(set(indices), key=int)
    else:
        indices = []
    return indices


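# Illustrative sketch, not part of the original pipeline: which indices survive
# the zero-line clean-up for a toy coverage vector. Only the first and last
# position of every zero run are kept, so the wig files stay sparse while the
# plotted profile still drops to zero at repeat borders.
def _handle_zero_lines_example():
    toy_hits = [0, 0, 3, 5, 0, 0, 0, 4, 0]
    kept = handle_zero_lines(toy_hits)
    # kept -> [0, 1, 2, 3, 4, 6, 7, 8]; index 5 (inside a zero run) is dropped
    return kept

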
def repeats_process_dom(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT, HTML_DATA,
                        xminimal, xmaximal, domains, seq_ids_dom, CN,
                        seq_ids_all, seq_lengths_all, files_dict):
    ''' Process the hits table separately for each fasta, create gff file and profile picture '''
    if files_dict:
        gff.create_gff(THRESHOLD, THRESHOLD_SEGMENT, OUTPUT_GFF, files_dict,
                       seq_ids_all)
    else:
        with open(OUTPUT_GFF, "w") as gff_file:
            gff_file.write("{}\n".format(configuration.HEADER_GFF))

    # TODO remove plotting, keep only basic report
    return None
    seqs_all_part = seq_ids_all[0:configuration.MAX_PIC_NUM]
    graphs_dict = {}
    seqs_long = []
    if files_dict:
        [graphs_dict, seqs_long] = visualization.vis_profrep(
            seq_ids_all, files_dict, seq_lengths_all, CN, HTML_DATA,
            seqs_all_part)
    count_seq = 0
    for seq in seqs_all_part:
        if seq in graphs_dict.keys():
            fig = graphs_dict[seq][0]
            ax = graphs_dict[seq][1]
            art = []
            lgd = ax.legend(bbox_to_anchor=(0.5, -0.1), loc=9, ncol=3)
            art.append(lgd)
            if seq in seq_ids_dom:
                dom_idx = seq_ids_dom.index(seq)
                [fig, ax] = visualization.vis_domains(
                    fig, ax, seq, xminimal[dom_idx], xmaximal[dom_idx],
                    domains[dom_idx])
        elif seq in seqs_long:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.text(
                0.3,
                0.5,
                "Graphs are only displayed if sequence is not longer than {} bp".format(
                    configuration.SEQ_LEN_VIZ),
                transform=ax.transAxes,
                fontsize=14,
                verticalalignment='center',
                color='blue')
        else:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.hlines(0, 0, seq_lengths_all[count_seq], color="red", lw=4)
            if seq in seq_ids_dom:
                dom_idx = seq_ids_dom.index(seq)
                [fig, ax] = visualization.vis_domains(
                    fig, ax, seq, xminimal[dom_idx], xmaximal[dom_idx],
                    domains[dom_idx])
        output_pic_png = "{}/{}.png".format(HTML_DATA, count_seq)
        fig.savefig(output_pic_png,
                    bbox_inches="tight",
                    format="png",
                    dpi=configuration.IMAGE_RES)
        count_seq += 1
    return None


def repeats_process(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT, HTML_DATA, CN,
                    seq_ids_all, seq_lengths_all, files_dict):
    ''' Process the hits table separately for each fasta, create gff file and profile picture '''
    if files_dict:
        gff.create_gff(THRESHOLD, THRESHOLD_SEGMENT, OUTPUT_GFF, files_dict,
                       seq_ids_all)
    else:
        with open(OUTPUT_GFF, "w") as gff_file:
            gff_file.write("{}\n".format(configuration.HEADER_GFF))

    # TODO remove plotting, keep only basic report
    return None
    seqs_all_part = seq_ids_all[0:configuration.MAX_PIC_NUM]
    graphs_dict = {}
    seqs_long = []
    if files_dict:
        [graphs_dict, seqs_long] = visualization.vis_profrep(
            seq_ids_all, files_dict, seq_lengths_all, CN, HTML_DATA,
            seqs_all_part)
    count_seq = 0
    for seq in seqs_all_part:
        if seq in graphs_dict.keys():
            fig = graphs_dict[seq][0]
            ax = graphs_dict[seq][1]
            art = []
            lgd = ax.legend(bbox_to_anchor=(0.5, -0.1), loc=9, ncol=3)
            art.append(lgd)
        elif seq in seqs_long:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.text(
                0.3,
                0.5,
                "Graphs are only displayed if sequence is not longer than {} bp".format(
                    configuration.SEQ_LEN_VIZ),
                transform=ax.transAxes,
                fontsize=14,
                verticalalignment='center',
                color='blue')
        else:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.hlines(0, 0, seq_lengths_all[count_seq], color="red", lw=4)
        output_pic_png = "{}/{}.png".format(HTML_DATA, count_seq)
        fig.savefig(output_pic_png,
                    bbox_inches="tight",
                    format="png",
                    dpi=configuration.IMAGE_RES)
        plt.close()
        count_seq += 1
    return None


def html_output(total_length, seq_lengths_all, seq_names, HTML, DB_NAME, REF,
                REF_LINK):
    ''' Define html output with limited number of output pictures and link to JBrowse '''
    info = "\t\t".join(['<pre> {} [{} bp]</pre>'.format(seq_name, seq_length)
                        for seq_name, seq_length in zip(seq_names,
                                                        seq_lengths_all)])
    if REF:
        ref_part_1 = REF.split("-")[0]
        ref_part_2 = "-".join(REF.split("-")[1:]).split(". ")[0]
        ref_part_3 = ". ".join("-".join(REF.split("-")[1:]).split(". ")[1:])
        ref_string = '''<h6> {} - <a href="{}" target="_blank" >{}</a>. {}'''.format(
            ref_part_1, REF_LINK, ref_part_2, ref_part_3)
    else:
        ref_string = "Custom Data"
    pictures = "\n\t\t".join(
        ['<img src="{}.png" width=1800>'.format(pic)
         for pic in range(len(seq_names))[:configuration.MAX_PIC_NUM]])
    html_str = configuration.HTML_STR.format(info, total_length, DB_NAME,
                                             pictures, ref_string)
    with open(HTML, "w") as html_file:
        html_file.write(html_str)


def adjust_tracklist(jbrowse_data_path):
    starting_lines = []
    ending_lines = []
    end = False
    with open(os.path.join(jbrowse_data_path, "trackList.json"),
              "r") as track_list:
        for line in track_list:
            if "]" not in line and not end:
                starting_lines.append(line)
            else:
                end = True
                ending_lines.append(line)
    with open(os.path.join(jbrowse_data_path, "trackList.json"),
              "w") as track_list:
        for line in starting_lines:
            track_list.write(line)
    return ending_lines


def jbrowse_prep_dom(HTML_DATA, QUERY, OUT_DOMAIN_GFF, OUTPUT_GFF, N_GFF,
                     total_length, JBROWSE_BIN, files_dict):
    ''' Set up the paths, link and convert output data to be displayed as tracks in JBrowse '''
    jbrowse_data_path = os.path.join(HTML_DATA, configuration.jbrowse_data_dir)
    with tempfile.TemporaryDirectory() as dirpath:
        subprocess.call(["{}/prepare-refseqs.pl".format(JBROWSE_BIN),
                         "--fasta", QUERY, "--out", jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", OUT_DOMAIN_GFF, "--trackLabel",
                         "GFF_domains", "--out", jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", OUTPUT_GFF, "--trackLabel", "GFF_repeats",
                         "--config", configuration.JSON_CONF_R, "--out",
                         jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", N_GFF, "--trackLabel", "N_regions",
                         "--config", configuration.JSON_CONF_N, "--out",
                         jbrowse_data_path])
        count = 0
        # Control the total length processed, if above threshold, don't create wig image tracks
        if files_dict:
            exclude = set(['ALL'])
            sorted_keys = sorted(set(files_dict.keys()).difference(exclude))
            sorted_keys.insert(0, "ALL")
            ending_lines = adjust_tracklist(jbrowse_data_path)
            track_list = open(
                os.path.join(jbrowse_data_path, "trackList.json"), "a")
            color_avail = len(configuration.COLORS_HEX)
            for repeat_id in sorted_keys:
                if count <= color_avail - 1:
                    color = configuration.COLORS_HEX[count]
                else:
                    r = lambda: random.randint(0, 255)
                    color = '#%02X%02X%02X' % (r(), r(), r())
                count += 1
                bw_name = "{}.bw".format(re.sub(r'[\/\|]', '_', repeat_id))
                subprocess.call(["wigToBigWig", files_dict[repeat_id][0],
                                 os.path.join(HTML_DATA,
                                              configuration.CHROM_SIZES_FILE),
                                 os.path.join(jbrowse_data_path, bw_name)])
                track_list.write(configuration.TRACK_LIST.format(
                    "{", bw_name, repeat_id, repeat_id, "{", color, "}", "}"))
            for line in ending_lines:
                track_list.write(line)
            track_list.close()
        distutils.dir_util.copy_tree(dirpath, jbrowse_data_path)
    return None


def jbrowse_prep(HTML_DATA, QUERY, OUTPUT_GFF, N_GFF, total_length,
                 JBROWSE_BIN, files_dict):
    ''' Set up the paths, link and convert output data to be displayed as tracks in JBrowse '''
    jbrowse_data_path = os.path.join(HTML_DATA, configuration.jbrowse_data_dir)
    with tempfile.TemporaryDirectory() as dirpath:
        subprocess.call(["{}/prepare-refseqs.pl".format(JBROWSE_BIN),
                         "--fasta", QUERY, "--out", jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", OUTPUT_GFF, "--trackLabel", "GFF_repeats",
                         "--config", configuration.JSON_CONF_R, "--out",
                         jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", N_GFF, "--trackLabel", "N_regions",
                         "--config", configuration.JSON_CONF_N, "--out",
                         jbrowse_data_path])
        count = 0
        ## Control the total length processed, if above threshold, don't create wig image tracks
        if files_dict:
            exclude = set(['ALL'])
            sorted_keys = sorted(set(files_dict.keys()).difference(exclude))
            sorted_keys.insert(0, "ALL")
            ending_lines = adjust_tracklist(jbrowse_data_path)
            track_list = open(
                os.path.join(jbrowse_data_path, "trackList.json"), "a")
            color_avail = len(configuration.COLORS_HEX)
            for repeat_id in sorted_keys:
                if count <= color_avail - 1:
                    color = configuration.COLORS_HEX[count]
                else:
                    r = lambda: random.randint(0, 255)
                    color = '#%02X%02X%02X' % (r(), r(), r())
                count += 1
                bw_name = "{}.bw".format(re.sub(r'[\/\|]', '_', repeat_id))
                subprocess.call(["wigToBigWig", files_dict[repeat_id][0],
                                 os.path.join(HTML_DATA,
                                              configuration.CHROM_SIZES_FILE),
                                 os.path.join(jbrowse_data_path, bw_name)])
                track_list.write(configuration.TRACK_LIST.format(
                    "{", bw_name, repeat_id, repeat_id, "{", color, "}", "}"))
            for line in ending_lines:
                track_list.write(line)
            track_list.close()
        distutils.dir_util.copy_tree(dirpath, jbrowse_data_path)
    return None


def genome2coverage(GS, BLAST_DB):
    ''' Convert genome size to coverage '''
    num_of_reads = 0
    with open(BLAST_DB, "r") as reads_all:
        first_line = reads_all.readline()
        if first_line.startswith(">"):
            num_of_reads += 1
            first_seq = reads_all.readline().rstrip()
        for line in reads_all:
            if line.startswith(">"):
                num_of_reads += 1
    len_of_read = len(first_seq)
    CV = (num_of_reads * len_of_read) / (GS * 1000000)  # GS in Mb
    return CV


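# Illustrative sketch, not part of the original pipeline: the coverage estimate
# above is (number of reads * read length) / genome size, with the genome size
# given in Mb and the read length taken from the first read in the database.
# The numbers below are hypothetical.
def _coverage_example():
    num_of_reads = 5000000      # reads in the BLAST database
    len_of_read = 100           # bp
    GS = 500                    # genome size in Mb
    CV = (num_of_reads * len_of_read) / (GS * 1000000)
    assert CV == 1.0            # 500 Mb of reads over a 500 Mb genome = 1x
    return CV

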
def prepared_data(DB_ID):
    ''' Get prepared rep. annotation data from the table based on the selected species ID '''
    tbl = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        configuration.PROFREP_DATA, configuration.PROFREP_TBL)
    with open(tbl, "r") as datasets:
        for line in datasets:
            if line.split("\t")[0] == DB_ID:
                DB_NAME = line.split("\t")[1]
                CV = float(line.split("\t")[5])
                REF = line.split("\t")[6]
                REF_LINK = line.split("\t")[7]
    return DB_NAME, CV, REF, REF_LINK


def seq_sizes_file(seq_ids, seq_lengths_all, HTML_DATA):
    chrom_sizes = os.path.join(HTML_DATA, configuration.CHROM_SIZES_FILE)
    with open(chrom_sizes, "w") as chroms:
        for seq_id, seq_length in zip(seq_ids, seq_lengths_all):
            chroms.write("{}\t{}\n".format(seq_id, seq_length))


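# Illustrative sketch, not part of the original pipeline: the file written
# above follows the two-column "chrom.sizes" layout expected by wigToBigWig
# (sequence ID, tab, length in bp). The sequence names below are hypothetical.
#
#   chr1_scaffold    1534201
#   chr2_scaffold     987654

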
def main(args):
    ## Command line arguments
    QUERY = args.query
    BLAST_DB = args.reads
    CL_ANNOTATION_TBL = args.ann_tbl
    CLS = args.cls
    BITSCORE = args.bit_score
    E_VALUE = args.e_value
    WORD_SIZE = args.word_size
    WINDOW = args.window
    OVERLAP = args.overlap
    BLAST_TASK = args.task
    MAX_ALIGNMENTS = args.max_alignments
    NEW_DB = args.new_db
    THRESHOLD = args.threshold_repeat
    THRESHOLD_SEGMENT = args.threshold_segment
    TH_IDENTITY = args.th_identity
    TH_LENGTH = args.th_length
    TH_INTERRUPT = args.interruptions
    TH_SIMILARITY = args.th_similarity
    TH_LEN_RATIO = args.max_len_proportion
    OUTPUT_GFF = args.output_gff
    DOMAINS = args.protein_domains
    LAST_DB = args.protein_database
    CLASSIFICATION = args.classification
    OUT_DOMAIN_GFF = args.domain_gff
    HTML = args.html_file
    HTML_DATA = args.html_path
    N_GFF = args.n_gff
    CN = args.copy_numbers
    GS = args.genome_size
    DB_ID = args.db_id
    THRESHOLD_SCORE = args.threshold_score
    WIN_DOM = args.win_dom
    OVERLAP_DOM = args.overlap_dom
    JBROWSE_BIN = args.jbrowse_bin
    DUST_FILTER = args.dust_filter
    LOG_FILE = args.log_file
    #JBROWSE_BIN = os.environ['JBROWSE_SOURCE_DIR']+"/bin"
    #if not JBROWSE_BIN:
    #    try:
    #        JBROWSE_BIN = os.environ['JBROWSE_BIN']
    #    except KeyError:
    #        raise ValueError('There was no path to JBrowse bin found - set the environment variable JBROWSE_BIN or pass the argument explicitly')
    if CN and not DB_ID and not GS:
        raise ValueError("Genome size missing - if you want to convert hits to copy numbers please enter the --genome_size parameter")

    ## Check if there are forbidden characters in fasta IDs
    [forbidden_ids, headers] = check_fasta_id(QUERY)
    if forbidden_ids:
        ##################### USER ERROR ###############################
        raise UserWarning(
            "The following IDs contain forbidden characters ('/' or '\\') - PLEASE REPLACE OR DELETE THEM:\n{}".format(
                "\n".join(forbidden_ids)))

    if len(headers) > len(set([header.split(" ")[0] for header in headers])):
        raise NameError(
            '''Sequences in multifasta format are not named correctly:
            seq IDs(before the first space) are the same''')

    ## Create new blast database of reads
    if NEW_DB:
        subprocess.call("makeblastdb -in {} -dbtype nucl".format(BLAST_DB),
                        shell=True)

    ## Parse prepared annotation data table
    if DB_ID:
        [DB_NAME, CV, REF, REF_LINK] = prepared_data(DB_ID)
    else:
        REF = None
        REF_LINK = None
        DB_NAME = "CUSTOM"

    ## Create dir to store outputs for html and JBrowse
    if not os.path.exists(HTML_DATA):
        os.makedirs(HTML_DATA)

    if not os.path.isabs(HTML):
        HTML = os.path.join(HTML_DATA, HTML)

    if not os.path.isabs(OUT_DOMAIN_GFF):
        OUT_DOMAIN_GFF = os.path.join(HTML_DATA, OUT_DOMAIN_GFF)

    if not os.path.isabs(LOG_FILE):
        LOG_FILE = os.path.join(HTML_DATA, LOG_FILE)

    if not os.path.isabs(N_GFF):
        N_GFF = os.path.join(HTML_DATA, N_GFF)

    if not os.path.isabs(OUTPUT_GFF):
        OUTPUT_GFF = os.path.join(HTML_DATA, OUTPUT_GFF)

    path = os.path.dirname(os.path.realpath(__file__))
    version_string = get_version(path)

    log = os.open(LOG_FILE, os.O_RDWR | os.O_CREAT)

    os.write(log, version_string.encode("utf-8"))

    ## Define parameters for parallel process
    STEP = WINDOW - OVERLAP
    NUM_CORES = multiprocessing.cpu_count()
    os.write(log, "NUM_OF_CORES = {}\n".format(NUM_CORES).encode("utf-8"))

    ## Convert genome size to coverage
    if CN and GS:
        CV = genome2coverage(GS, BLAST_DB)
        os.write(log, "COVERAGE = {}\n".format(CV).encode("utf-8"))

    parallel_pool = Pool(NUM_CORES)

    ## Assign clusters to repetitive classes
    [cl_annotations_items, annotation_keys
     ] = cluster_annotation(CL_ANNOTATION_TBL)

    ## Assign reads to repetitive classes
    reads_annotations = read_annotation(CLS, cl_annotations_items)

    ## Detect all fasta sequences from input
    fasta_list = multifasta(QUERY)
    headers = []
    files_dict = {}
    wig_files = []
    seq_count = 1
    start = 1
    total_length = 0
    seq_lengths_all = []
    Ngff = open(N_GFF, "w")
    Ngff.write("{}\n".format(configuration.HEADER_GFF))

    ## Find hits for each fasta sequence separately
    t_blast = time.time()
    for subfasta in fasta_list:
        [header, sequence] = fasta_read(subfasta)
        os.write(log, "Sequence {} is being processed...\n".format(
            header).encode("utf-8"))
        os.fsync(log)
        indices_N = [indices + 1
                     for indices, n in enumerate(sequence)
                     if n == "n" or n == "N"]
        if indices_N:
            gff.idx_ranges_N(indices_N, configuration.N_segment, header, Ngff,
                             configuration.N_NAME, configuration.N_FEATURE)
        seq_length = len(sequence)
        headers.append(header)
        ## Create parallel process
        subset_index = list(range(0, seq_length, STEP))
        ## Situation when the penultimate window is incomplete but is followed by another one
        if len(subset_index) > 1 and subset_index[-2] + WINDOW >= seq_length:
            subset_index = subset_index[:-1]
        last_index = subset_index[-1]
        index_range = range(len(subset_index))
        for chunk_index in index_range[0::configuration.MAX_FILES_SUBPROFILES]:
            multiple_param = partial(
                parallel_process, WINDOW, OVERLAP, seq_length, annotation_keys,
                reads_annotations, subfasta, BLAST_DB, E_VALUE, WORD_SIZE,
                BLAST_TASK, MAX_ALIGNMENTS, BITSCORE, DUST_FILTER, last_index,
                len(subset_index))
            subprofiles_all = parallel_pool.map(multiple_param, subset_index[
                chunk_index:chunk_index + configuration.MAX_FILES_SUBPROFILES])
            ## Join partial profiles to the final profile of the sequence
            if CN:
                [files_dict, wig_files
                 ] = concatenate_prof_CN(CV, subprofiles_all, files_dict,
                                         header, HTML_DATA, wig_files)
            else:
                [files_dict, wig_files] = concatenate_prof(
                    subprofiles_all, files_dict, header, HTML_DATA, wig_files)
            for subprofile in subprofiles_all:
                os.unlink(subprofile)
        total_length += seq_length
        seq_lengths_all.append(seq_length)

    os.write(log, "ELAPSED_TIME_BLAST = {} s\n".format(
        time.time() - t_blast).encode("utf-8"))
    os.write(
        log,
        "TOTAL_LENGTH_ANALYZED = {} bp\n".format(total_length).encode("utf-8"))

    ## Close opened files
    for opened_file in wig_files:
        opened_file.close()
    Ngff.close()

    ## Create file containing size of sequences to convert wig to bigwig
    seq_sizes_file(headers, seq_lengths_all, HTML_DATA)

    ## Protein domains module
    t_domains = time.time()
    if DOMAINS:
        os.write(log, "Domains module has started...\n".encode("utf-8"))
        os.fsync(log)
        domains_primary = NamedTemporaryFile(delete=False)
        protein_domains.domain_search(QUERY, LAST_DB, CLASSIFICATION,
                                      domains_primary.name, THRESHOLD_SCORE,
                                      WIN_DOM, OVERLAP_DOM)
        domains_primary.close()
        [xminimal, xmaximal, domains, seq_ids_dom
         ] = domains_filtering.filter_qual_dom(
             domains_primary.name, OUT_DOMAIN_GFF, TH_IDENTITY, TH_SIMILARITY,
             TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, 'All', "")
        os.unlink(domains_primary.name)
        os.write(log, "ELAPSED_TIME_DOMAINS = {} s\n".format(
            time.time() - t_domains).encode("utf-8"))

        # Process individual sequences from the input file sequentially
        t_gff_vis = time.time()
        repeats_process_dom(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT,
                            HTML_DATA, xminimal, xmaximal, domains,
                            seq_ids_dom, CN, headers, seq_lengths_all,
                            files_dict)
        os.write(log, "ELAPSED_TIME_GFF_VIS = {} s\n".format(
            time.time() - t_gff_vis).encode("utf-8"))
        os.fsync(log)
        # Prepare data for html output
        t_jbrowse = time.time()
        os.write(log, "JBrowse tracks are being prepared...\n".encode("utf-8"))
        os.fsync(log)
        jbrowse_prep_dom(HTML_DATA, QUERY, OUT_DOMAIN_GFF, OUTPUT_GFF, N_GFF,
                         total_length, JBROWSE_BIN, files_dict)
        os.write(log, "ELAPSED_TIME_JBROWSE_PREP = {} s\n".format(
            time.time() - t_jbrowse).encode("utf-8"))
    else:
        # Process individual sequences from the input file sequentially
        t_gff_vis = time.time()
        repeats_process(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT, HTML_DATA,
                        CN, headers, seq_lengths_all, files_dict)
        os.write(log, "ELAPSED_TIME_GFF_VIS = {} s\n".format(
            time.time() - t_gff_vis).encode("utf-8"))

        # Prepare data for html output
        t_jbrowse = time.time()
        jbrowse_prep(HTML_DATA, QUERY, OUTPUT_GFF, N_GFF, total_length,
                     JBROWSE_BIN, files_dict)
        os.write(log, "ELAPSED_TIME_JBROWSE_PREP = {} s\n".format(
            time.time() - t_jbrowse).encode("utf-8"))

    # Create HTML output
    t_html = time.time()
    os.write(
        log,
        "HTML output and JBrowse data structure are being prepared...\n".encode(
            "utf-8"))
    os.fsync(log)
    html_output(total_length, seq_lengths_all, headers, HTML, DB_NAME, REF,
                REF_LINK)
    os.write(log, "ELAPSED_TIME_HTML = {} s\n".format(
        time.time() - t_html).encode("utf-8"))
    os.write(log, "ELAPSED_TIME_PROFREP = {} s\n".format(
        time.time() - t_profrep).encode("utf-8"))
    os.close(log)

    ## Clean up the temporary fasta files
    for subfasta in fasta_list:
        os.unlink(subfasta)


if __name__ == "__main__":
    import argparse
    from argparse import RawDescriptionHelpFormatter

    class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                          argparse.RawDescriptionHelpFormatter):
        pass

    # Default paths (command line usage)
    HTML = configuration.HTML
    DOMAINS_GFF = configuration.DOMAINS_GFF
    REPEATS_GFF = configuration.REPEATS_GFF
    N_GFF = configuration.N_GFF
    LOG_FILE = configuration.LOG_FILE
    PROFREP_OUTPUT_DIR = configuration.PROFREP_OUTPUT_DIR

    # Command line arguments
    parser = argparse.ArgumentParser(
        description='''

    DEPENDENCIES:
        - python 3.4 or higher with packages:
            - numpy
            - matplotlib
            - biopython
        - BLAST 2.2.28+ or higher
        - LAST 744 or higher
        - wigToBigWig
        - cd-hit
        - JBrowse - ! Only bin needed, does not have to be installed under a web server
        * ProfRep Modules:
            - gff.py
            - visualization.py
            - configuration.py
            - protein_domains.py
            - domains_filtering.py

    EXAMPLE OF USAGE:

        ./profrep.py --query PATH_TO_DNA_SEQ --reads PATH_TO_READS --ann_tbl PATH_TO_CLUSTERS_CLASSIFICATION --cls PATH_TO_hitsort.cls [--new_db True]
        ''',
        epilog="""take a look at README for more detailed information""",
        formatter_class=CustomFormatter)

    Required = parser.add_argument_group('required arguments')
    altRequired = parser.add_argument_group(
        'alternative required arguments - prepared datasets')
    blastOpt = parser.add_argument_group('optional arguments - BLAST Search')
    parallelOpt = parser.add_argument_group(
        'optional arguments - Parallel Processing')
    protOpt = parser.add_argument_group('optional arguments - Protein Domains')
    outOpt = parser.add_argument_group('optional arguments - Output Paths')
    cnOpt = parser.add_argument_group(
        'optional arguments - Copy Numbers/Hits ')
    galaxyOpt = parser.add_argument_group(
        'optional arguments - Environment Variables')

    ################ INPUTS ############################################
    Required.add_argument('-q',
                          '--query',
                          type=str,
                          required=True,
                          help='input DNA sequence in (multi)fasta format')
    Required.add_argument('-rdb',
                          '--reads',
                          type=str,
                          required=True,
                          help='blast database of all sequencing reads')
    Required.add_argument(
        '-a',
        '--ann_tbl',
        type=str,
        required=True,
        help='clusters annotation table, tab-separated number of cluster and its classification')
    Required.add_argument(
        '-c',
        '--cls',
        type=str,
        required=True,
        help='cls file containing reads assigned to clusters (hitsort.cls)')
    altRequired.add_argument(
        '-id',
        '--db_id',
        type=str,
        help='annotation dataset ID (first column of datasets table)')

    ################ BLAST parameters ##################################
    blastOpt.add_argument('-bs',
                          '--bit_score',
                          type=float,
                          default=50,
                          help='bitscore threshold')
    blastOpt.add_argument(
        '-m',
        '--max_alignments',
        type=int,
        default=10000000,
        help='blast filtering option: maximal number of alignments in the output')
    blastOpt.add_argument('-e',
                          '--e_value',
                          type=str,
                          default=0.1,
                          help='blast setting option: e-value')
    blastOpt.add_argument(
        '-df',
        '--dust_filter',
        type=str,
        default="'20 64 1'",
        help='dust filters low-complexity regions during BLAST search')
    blastOpt.add_argument(
        '-ws',
        '--word_size',
        type=int,
        default=11,
        help='blast search option: initial word size for alignment')
    blastOpt.add_argument('-t',
                          '--task',
                          type=str,
                          default="blastn",
                          help='type of blast to be triggered')
    blastOpt.add_argument(
        '-n',
        '--new_db',
        type=str2bool,
        default=True,
        help='create a new blast database, USE THIS OPTION IF YOU RUN PROFREP WITH NEW DATABASE FOR THE FIRST TIME')

    ############### PARALLEL PROCESSING ARGUMENTS ######################
    parallelOpt.add_argument(
        '-w',
        '--window',
        type=int,
        default=5000,
        help='sliding window size for parallel processing')
    parallelOpt.add_argument(
        '-o',
        '--overlap',
        type=int,
        default=150,
        help='overlap of regions processed in parallel, set it greater than the read size')

    ################ PROTEIN DOMAINS PARAMETERS ########################
    protOpt.add_argument('-pd',
                         '--protein_domains',
                         type=str2bool,
                         default=False,
                         help='use module for protein domains')
    protOpt.add_argument('-pdb',
                         '--protein_database',
                         type=str,
                         help='protein domains database')
    protOpt.add_argument('-cs',
                         '--classification',
                         type=str,
                         help='protein domains classification file')
    protOpt.add_argument(
        '-wd',
        '--win_dom',
        type=int,
        default=10000000,
        help='protein domains module: sliding window to process large input sequences sequentially')
    protOpt.add_argument(
        '-od',
        '--overlap_dom',
        type=int,
        default=10000,
        help='protein domains module: overlap of sequences in two consecutive windows')
    protOpt.add_argument(
        '-thsc',
        '--threshold_score',
        type=int,
        default=80,
        help='protein domains module: percentage of the best score within the cluster to significant domains')
    protOpt.add_argument("-thl",
                         "--th_length",
                         type=float,
                         choices=[Range(0.0, 1.0)],
                         default=0.8,
                         help="proportion of alignment length threshold")
    protOpt.add_argument("-thi",
                         "--th_identity",
                         type=float,
                         choices=[Range(0.0, 1.0)],
                         default=0.35,
                         help="proportion of alignment identity threshold")
    protOpt.add_argument(
        "-ths",
        "--th_similarity",
        type=float,
        choices=[Range(0.0, 1.0)],
        default=0.45,
        help="threshold for alignment proportional similarity")
    protOpt.add_argument(
        "-ir",
        "--interruptions",
        type=int,
        default=3,
        help="interruptions (frameshifts + stop codons) tolerance threshold per 100 AA")
    protOpt.add_argument(
        "-mlen",
        "--max_len_proportion",
        type=float,
        default=1.2,
        help="maximal proportion of alignment length to the original length of protein domain from database")

    ################ OUTPUTS ###########################################
    outOpt.add_argument('-lg',
                        '--log_file',
                        type=str,
                        default=LOG_FILE,
                        help='path to log file')
    outOpt.add_argument('-ouf',
                        '--output_gff',
                        type=str,
                        default=REPEATS_GFF,
                        help='path to output gff of repetitive regions')
    outOpt.add_argument('-oug',
                        '--domain_gff',
                        type=str,
                        default=DOMAINS_GFF,
                        help='path to output gff of protein domains')
    outOpt.add_argument('-oun',
                        '--n_gff',
                        type=str,
                        default=N_GFF,
                        help='path to output gff of N regions')
    outOpt.add_argument('-hf',
                        '--html_file',
                        type=str,
                        default=HTML,
                        help='path to output html file')
    outOpt.add_argument('-hp',
                        '--html_path',
                        type=str,
                        default=PROFREP_OUTPUT_DIR,
                        help='path to html extra files')

    ################ HITS/COPY NUMBERS #################################
    cnOpt.add_argument('-cn',
                       '--copy_numbers',
                       type=str2bool,
                       default=False,
                       help='convert hits to copy numbers')
    cnOpt.add_argument(
        '-gs',
        '--genome_size',
        type=float,
        help='genome size is required when converting hits to copy numbers and you use custom data')
    cnOpt.add_argument(
        '-thr',
        '--threshold_repeat',
        type=int,
        default=3,
        help='threshold for hits/copy numbers per position to be considered repetitive')
    cnOpt.add_argument(
        '-thsg',
        '--threshold_segment',
        type=int,
        default=80,
        help='threshold for the length of repetitive segment to be reported')

    ################ JBrowse ###########################################
    galaxyOpt.add_argument('-jb',
                           '--jbrowse_bin',
                           type=str,
                           help='path to JBrowse bin directory')

    args = parser.parse_args()
    main(args)