Mercurial > repos > triasteran > ribogalaxy_get_chrom_sizes_upd
comparison get_chrom_sizes/calculating_chrom.sizes.py @ 0:b93d6b2e561b draft
Uploaded
| author | triasteran |
|---|---|
| date | Mon, 13 Feb 2023 15:49:26 +0000 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:b93d6b2e561b |
|---|---|
# Read a genome FASTA file and write a genome.chrom.sizes file that can be
# associated with a custom build (or simply kept as an output to be used
# later in the history).
# Adapted from https://bioexpressblog.wordpress.com/2014/04/15/calculate-length-of-all-sequences-in-an-multi-fasta-file/
#
# Usage:
#   python calculating_chrom.sizes.py FASTA_SOURCE PREFIX GENOME BUILTIN OUTPUT
#
#   FASTA_SOURCE  'builtin' to read the BUILTIN path; any other value reads GENOME
#   PREFIX        text prepended to every sequence name, or 'none' for no prefix
#   GENOME        path of the FASTA file used when FASTA_SOURCE is not 'builtin'
#   BUILTIN       path of the FASTA file used when FASTA_SOURCE is 'builtin'
#   OUTPUT        path of the tab-separated "<name>\t<length>" file to write
from sys import argv
from typing import Iterable, Iterator, List, Optional, Tuple

_USAGE = (
    "usage: calculating_chrom.sizes.py "
    "FASTA_SOURCE PREFIX GENOME BUILTIN OUTPUT"
)


def _iter_chrom_sizes(lines: Iterable[str]) -> Iterator[Tuple[str, int]]:
    """Yield ``(name, length)`` for each FASTA record in *lines*, in file order.

    The name is the header text after ``>`` up to the first space or tab.
    The length is the summed length of the record's sequence lines with only
    the newline removed.

    Raises:
        ValueError: if non-blank text appears before the first ``>`` header.
    """
    name = None
    length = 0
    for line in lines:
        if line.startswith(">"):
            if name is not None:
                yield name, length
            header = line.strip("\n")[1:]
            # Cut at the first space; also cut at a tab so the name can never
            # add an extra column to the tab-separated output.
            name = header.split(" ", 1)[0].split("\t", 1)[0]
            length = 0
        elif name is not None:
            length += len(line.strip("\n"))
        elif line.strip():
            # Sequence text with no header has no name to be reported under.
            raise ValueError(
                "sequence data found before the first FASTA header ('>')"
            )
    # Emit the final record unconditionally: comparing it against earlier
    # records by value would drop a legitimate record with equal name/length.
    if name is not None:
        yield name, length


def main(args: Optional[List[str]] = None) -> None:
    """Write the chrom.sizes file described by the command-line arguments.

    Args:
        args: the five positional arguments (see the usage comment at the top
            of this file); defaults to ``sys.argv[1:]``. Extra arguments are
            ignored.

    Raises:
        SystemExit: if fewer than five arguments are supplied.
        ValueError: if the FASTA file has sequence data before any header.
    """
    if args is None:
        args = argv[1:]
    if len(args) < 5:
        raise SystemExit(_USAGE)
    fasta_source, prefix, genome, builtin, output = (str(a) for a in args[:5])

    # genome = 'test-data/test.fasta'
    # output = "test-data/test_chrom.sizes"
    if fasta_source == 'builtin':
        genome = builtin

    # Iterate the file lazily: genomes can be far larger than available RAM.
    with open(genome, 'r') as fasta:
        records = list(_iter_chrom_sizes(fasta))

    label = '' if prefix == 'none' else prefix
    # Open the output only after parsing succeeded, so a malformed input does
    # not leave a truncated result behind.
    with open(output, "w") as chrom_sizes_output:
        chrom_sizes_output.writelines(
            f"{label}{name}\t{length}\n" for name, length in records
        )


if __name__ == "__main__":
    main()
