diff get_chrom_sizes/calculating_chrom.sizes.py @ 0:b93d6b2e561b draft

Uploaded
author triasteran
date Mon, 13 Feb 2023 15:49:26 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_chrom_sizes/calculating_chrom.sizes.py	Mon Feb 13 15:49:26 2023 +0000
@@ -0,0 +1,43 @@
+# input a genome file and return a file genome.chrom.sizes to be associated with the custom build (or just have it as an output to be used later in the history.
+# adapted from https://bioexpressblog.wordpress.com/2014/04/15/calculate-length-of-all-sequences-in-an-multi-fasta-file/
+from sys import argv
+# python calculating_chrom.sizes.py genome_input.fa output.chrom.sizes
+fasta_source = str(argv[1])
+prefix = str(argv[2])
+genome = str(argv[3])
+builtin = str(argv[4])
+output = str(argv[5])
+
+# genome = 'test-data/test.fasta'
+# output = "test-data/test_chrom.sizes"
+if fasta_source == 'builtin':
+	genome = builtin
+
+chromSizesoutput = open(output,"w")
+
+records = []
+record = False
+for line in open(genome, 'r').readlines():
+	if line[0] == '>':
+		if record:
+			records.append(record)
+		record = [line.strip("\n").split(' ')[0].split(" ")[0][1:], 0]
+
+	else:
+		sequence = line.strip('\n')
+		record[1] += len(sequence)
+
+if record not in records:
+	records.append(record)
+
+
+
+for seq_record in records:
+	if prefix != 'none':
+		output_line = f"{prefix}{seq_record[0]}\t{seq_record[1]}\n"
+	else:
+		output_line = f"{seq_record[0]}\t{seq_record[1]}\n"
+
+	chromSizesoutput.write(output_line)
+
+chromSizesoutput.close()