view create_ribosome_profile/bam_to_ribosome_profile.py @ 9:11e3691f9260 draft

Uploaded
author triasteran
date Mon, 13 Feb 2023 15:18:47 +0000
parents 49e8fcbaaa72
children
line wrap: on
line source

#python /home/data2/GWIPS_viz/python_and_other_scripts/Converting_to_ribosomeprofiles/bowtieOutputToRibosomeProfileBedfile_RNase1.py /home/data2/GWIPS_viz/Ribo_seq/Gao14_human/QTI_HEK293_AminoAcid_Starvation/QTI_HEK293_AminoAcid_Starvation.bam_sorted.bam 12 /home/data2/GWIPS_viz/Annotations_genomes_etc/Homo_sapiens_UCSC/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa


from Bio import  SeqIO
import pysam, os
from sys import argv


def run_weight_centered(all_reads):
	'''
	calulate asite positions using weight centered approach
	'''
	sequence = {}
		
	for read in all_reads :

		if read.qlen < 25 : continue 

		protect_nts = sorted(read.positions)

		for Asite in protect_nts[12:-12] :
			if Asite in sequence:
				sequence[Asite] += 1.0/len(protect_nts[12:-12])
			else:
				sequence[Asite] = 1.0/len(protect_nts[12:-12])

		return sequence


def run_offset(all_reads, offset):
	'''
	calculate a-site position using provided offset. 
	'''
	sequence = {}
		
	for read in all_reads :
		if read.qlen < 25 : continue

		protect_nts = sorted(read.positions)

		if not read.is_reverse:
			Asite = protect_nts[offset]
		else :
			Asite = protect_nts[-1 - offset]

		if Asite in sequence:
			sequence[Asite] += 1
		else:
			sequence[Asite] = 1
	
	return sequence


def create_chrom_sizes(fasta_path):
	'''
	create chrom.sizes file from fasta input
	'''

	chromSizesoutput = open(fasta_path + "_chrom.sizes","w")

	records = []
	record = False
	for line in open(fasta_path, 'r').readlines():
		if line[0] == '>':
			if record:
				records.append(record)
			record = [line.strip("\n").split(' ')[0][1:], 0]

		else:
			sequence = line.strip('\n')
			record[1] += len(sequence)
			
	for seq_record in records:
		output_line = '%s\t%i\n' % (seq_record[0], seq_record[1])
		chromSizesoutput.write(output_line)

	chromSizesoutput.close()


def main(bam_path, fasta_path, output, mode="offset", offset=0):
	'''
	create sorted bed file of a ribosome profile in two mode options
	offset - calculate a site with an inputted estimated distance from read end. Same applied to all read lengths 
	weight - weight centered approach is used (suitable for mnase digested reads)
	'''
	alignments	= pysam.Samfile(bam_path, 'rb') 

	with open(fasta_path) as in_seq_handle:
		seq_dict = SeqIO.to_dict(SeqIO.parse(in_seq_handle, "fasta"))
		seq_dict_keys =  sorted(seq_dict.keys())



	bed_path = str(bam_path) + ".bed"

	bedfile = open(bed_path, "w")

	for chrom in seq_dict_keys:		
		try:
			all_reads = alignments.fetch(chrom)
		except:
			print(f"No reads fetchable from provided bam file for chromosome {chrom}. Perhaps the fasta file is not what you aligned to")

		if mode == "offset":
			sequence = run_offset(all_reads, offset)
		
		elif mode == "weight":
			sequence = run_weight_centered(all_reads)

		for Asite in sorted(sequence):
			bedfile.write(f"{chrom}\t{Asite}\t{Asite + 1}\t{sequence[Asite]}\n")
			
	bedfile.close()

	command = "sort -k1,1 -k2,2n %s > %s"%(bed_path, output)
	os.system(command)


if __name__ == '__main__':
	bam_path = str(argv[1]) 
	offset = int(argv[2])
	fasta_source = str(argv[3])
	fasta_path = str(argv[4])
	builtin = str(argv[5])
	mode = argv[6]
	output = argv[7]

	if not os.path.exists(bam_path + ".bai"): 
		print(bam_path)
		pysam.index(bam_path)
	
	if fasta_source == 'history':
		main(bam_path = bam_path, fasta_path = fasta_path, output=output, mode=mode, offset=offset)
	else:
		main(bam_path = bam_path, fasta_path = builtin, output=output, mode=mode, offset=offset)