annotate get_chrom_sizes/calculating_chrom.sizes.py @ 1:f2306942d61a draft default tip

Uploaded
author triasteran
date Tue, 14 Feb 2023 12:29:12 +0000
parents b93d6b2e561b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
b93d6b2e561b Uploaded
triasteran
parents:
diff changeset
1 # Input a genome FASTA file and write a genome.chrom.sizes file to be associated with the custom build (or just keep it as an output to be used later in the history).
b93d6b2e561b Uploaded
triasteran
parents:
diff changeset
2 # adapted from https://bioexpressblog.wordpress.com/2014/04/15/calculate-length-of-all-sequences-in-an-multi-fasta-file/
b93d6b2e561b Uploaded
triasteran
parents:
diff changeset
3 from sys import argv
b93d6b2e561b Uploaded
triasteran
parents:
diff changeset
4 # usage: python calculating_chrom.sizes.py <fasta_source> <prefix> <genome.fa> <builtin.fa> <output.chrom.sizes>
b93d6b2e561b Uploaded
triasteran
parents:
diff changeset
def read_chrom_sizes(lines):
    """Return ``[(name, length), ...]`` for every FASTA record in *lines*.

    *lines* is any iterable of text lines (an open file works, so the genome
    is streamed rather than loaded in one piece).  A record starts at a line
    beginning with ``>``; its name is the first whitespace-delimited token of
    that line without the ``>``.  Its length is the number of characters on
    the following lines, line terminators excluded.  Records are returned in
    file order and duplicates are kept.

    Raises:
        ValueError: if non-blank text appears before the first ``>`` header.
    """
    records = []
    name = None  # None until the first header has been seen
    length = 0
    for line in lines:
        if line.startswith(">"):
            if name is not None:
                records.append((name, length))
            # Split on any whitespace so a tab in the header cannot leak
            # into the tab-separated output; a bare ">" gives an empty name.
            name = line.split()[0][1:]
            length = 0
            continue
        residues = line.rstrip("\r\n")
        if name is None:
            if residues.strip():
                raise ValueError(
                    "sequence data found before the first '>' header"
                )
            continue  # tolerate blank lines ahead of the first record
        length += len(residues)
    # Flush the last record unconditionally: comparing it by value against
    # earlier records would drop a legitimate duplicate name/length pair.
    if name is not None:
        records.append((name, length))
    return records


def write_chrom_sizes(records, prefix, handle):
    """Write one ``<prefix><name>\\t<length>`` line per record to *handle*.

    The literal prefix ``'none'`` means "no prefix".
    """
    label = "" if prefix == "none" else prefix
    handle.writelines(
        f"{label}{name}\t{length}\n" for name, length in records
    )


def main(args):
    """Run the tool with an ``argv``-style list.

    Positional arguments (after the program name):
        fasta_source  ``'builtin'`` to read the *builtin* path, anything
                      else to read the *genome* path
        prefix        text prepended to every sequence name, or ``'none'``
        genome        path of the FASTA file used unless source is builtin
        builtin       path of the FASTA file used when source is builtin
        output        path of the chrom.sizes file to write
    """
    if len(args) < 6:
        raise SystemExit(
            "usage: calculating_chrom.sizes.py "
            "<fasta_source> <prefix> <genome> <builtin> <output>"
        )
    fasta_source, prefix, genome, builtin, output = (
        str(value) for value in args[1:6]
    )
    if fasta_source == 'builtin':
        genome = builtin

    # Parse first so the output is only created once the input is known to
    # be readable and well formed; both handles are closed by `with`.
    with open(genome, 'r') as fasta:
        records = read_chrom_sizes(fasta)
    with open(output, "w") as chrom_sizes_output:
        write_chrom_sizes(records, prefix, chrom_sizes_output)


if __name__ == "__main__":
    main(argv)