annotate get_chrom_sizes/calculating_chrom.sizes.py @ 4:c6a297d05c8e draft

Uploaded
author jackcurragh
date Fri, 13 May 2022 09:17:14 +0000
parents cfdf764b9226
children c96b29e00427
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
27f3669eda60 Uploaded
jackcurragh
parents:
diff changeset
1 # Input a genome FASTA file and write a genome.chrom.sizes file to be associated with the custom build (or simply kept as an output to be used later in the history).
27f3669eda60 Uploaded
jackcurragh
parents:
diff changeset
2 # adapted from https://bioexpressblog.wordpress.com/2014/04/15/calculate-length-of-all-sequences-in-an-multi-fasta-file/
27f3669eda60 Uploaded
jackcurragh
parents:
diff changeset
3 from sys import argv
27f3669eda60 Uploaded
jackcurragh
parents:
diff changeset
4 # python calculating_chrom.sizes.py genome_input.fa prefix output.chrom.sizes   (pass 'none' as prefix to leave sequence names unchanged)
27f3669eda60 Uploaded
jackcurragh
parents:
diff changeset
def iter_chrom_sizes(lines):
    """Yield ``(name, length)`` for every record in FASTA-formatted *lines*.

    *lines* is any iterable of text lines (for example an open file).  A
    record starts at a line beginning with ``>``; its name is the header text
    after ``>`` up to the first space or tab, and its length is the total
    number of characters on the following lines, newlines excluded.

    Records are yielded in input order, one per header line, including
    records that repeat an earlier name and length.

    Raises:
        ValueError: if non-blank sequence data appears before the first
            header line.
    """
    name = None  # None means "no header seen yet"
    length = 0
    for raw_line in lines:
        line = raw_line.rstrip("\n")
        if line.startswith(">"):
            if name is not None:
                yield name, length
            # Cut at the first space or tab: the output is tab-separated, so
            # a tab must never end up inside the name column.
            name = line[1:].split(" ", 1)[0].split("\t", 1)[0]
            length = 0
        elif name is not None:
            length += len(line)
        elif line.strip():
            raise ValueError(
                "sequence data found before the first '>' header line"
            )
    if name is not None:
        yield name, length


def write_chrom_sizes(genome, prefix, output):
    """Write a chrom.sizes table for the FASTA file *genome* to *output*.

    Each output line is ``<prefix><name>\\t<length>``.  The literal prefix
    ``'none'`` means "no prefix".  The input is streamed line by line so a
    whole genome is never held in memory, and both files are closed even if
    parsing fails.  An empty input produces an empty output file.
    """
    label = "" if prefix == "none" else prefix
    with open(genome, "r") as fasta, open(output, "w") as sizes:
        for name, length in iter_chrom_sizes(fasta):
            sizes.write(f"{label}{name}\t{length}\n")


def main():
    """Command-line entry point: ``genome_input.fa prefix output.chrom.sizes``."""
    if len(argv) < 4:
        raise SystemExit(
            "usage: python calculating_chrom.sizes.py "
            "genome_input.fa prefix output.chrom.sizes"
        )
    write_chrom_sizes(str(argv[1]), str(argv[2]), str(argv[3]))


if __name__ == "__main__":
    main()