Mercurial > repos > jackcurragh > ribogalaxy_get_chrom_sizes
view get_chrom_sizes/calculating_chrom.sizes.py @ 8:84664985411c draft
Uploaded
author | jackcurragh |
---|---|
date | Tue, 17 May 2022 13:16:52 +0000 |
parents | c96b29e00427 |
children | c7e2879bf357 |
line wrap: on
line source
# Input a genome FASTA file and write a genome.chrom.sizes file to be
# associated with the custom build (or just kept as an output to be used
# later in the history).
# Adapted from https://bioexpressblog.wordpress.com/2014/04/15/calculate-length-of-all-sequences-in-an-multi-fasta-file/
#
# Usage:
#   python calculating_chrom.sizes.py <fasta_source> <genome.fa> <builtin.fa> <prefix> <output.chrom.sizes>
#
#   fasta_source  'builtin' to read <builtin.fa>; anything else reads <genome.fa>
#   prefix        string prepended to every sequence name; 'none' disables it
from sys import argv


def parse_chrom_sizes(lines):
    """Return ``(name, length)`` for every FASTA record in *lines*.

    Args:
        lines: Iterable of text lines (an open file object works and is
            streamed, so the whole genome is never held in memory).

    Returns:
        A list of ``(name, length)`` tuples in input order. ``name`` is the
        header text after ``>`` up to the first space or tab; ``length`` is
        the number of characters on the record's sequence lines, ignoring
        leading/trailing whitespace. Empty input yields an empty list.

    Raises:
        ValueError: If non-blank content appears before the first header.
    """
    records = []
    name = None
    length = 0
    for line in lines:
        if line.startswith('>'):
            if name is not None:
                records.append((name, length))
            header = line[1:].rstrip('\r\n')
            # Cut at a tab as well as a space: the output is tab-delimited,
            # so a tab inside the name would add a bogus column.
            name = header.split(' ', 1)[0].split('\t', 1)[0]
            length = 0
            continue
        sequence = line.strip()
        if name is None:
            if sequence:
                raise ValueError(
                    "FASTA input has sequence data before the first '>' header"
                )
            continue
        length += len(sequence)
    # Always flush the final record. Testing membership by value here would
    # drop it whenever an earlier record had the same name and length.
    if name is not None:
        records.append((name, length))
    return records


def format_chrom_sizes(records, prefix='none'):
    """Return one ``"<name>\\t<length>\\n"`` line per record.

    Args:
        records: Iterable of ``(name, length)`` pairs.
        prefix: Text prepended to each name; the literal ``'none'`` means
            no prefix.
    """
    name_prefix = '' if prefix == 'none' else prefix
    return [f"{name_prefix}{name}\t{length}\n" for name, length in records]


def main(args):
    """Run the tool with command-line style *args* (``args[0]`` is ignored)."""
    if len(args) < 6:
        raise SystemExit(
            "usage: calculating_chrom.sizes.py <fasta_source> <genome.fa> "
            "<builtin.fa> <prefix> <output.chrom.sizes>"
        )
    fasta_source, genome, builtin, prefix, output = args[1:6]

    print(fasta_source, genome, builtin, prefix, output)

    if fasta_source == 'builtin':
        genome = builtin

    # Parse before opening the output so a bad input does not leave behind a
    # truncated chrom.sizes file.
    with open(genome, 'r') as fasta:
        records = parse_chrom_sizes(fasta)

    with open(output, 'w') as chrom_sizes_output:
        chrom_sizes_output.writelines(format_chrom_sizes(records, prefix))


if __name__ == '__main__':
    main(argv)